[llvm] Rework i1->i32 zext/anyext translation (PR #114721)

via llvm-commits llvm-commits at lists.llvm.org
Sun Nov 3 15:33:05 PST 2024


https://github.com/doraeneko updated https://github.com/llvm/llvm-project/pull/114721

From 1cbb81fed5bc930b9cee1d0552c24ac9b84bde43 Mon Sep 17 00:00:00 2001
From: "Andreas Gaiser (doraeneko)" <gaiseras at gmail.com>
Date: Thu, 10 Oct 2024 04:11:43 +0200
Subject: [PATCH] Rework i1->i32 zext/anyext translation to distinguish uniform
 and divergent cases (#87938), mirroring the existing sext_inreg handling.

Uniform extends can now be selected to S_CSELECT_B32 on SCC instead of always
materializing the result with V_CNDMASK_B32; divergent extends keep the VALU
path. Also adapt the flat_atomic test cases.
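
A minimal sketch of the two cases the new patterns separate (hypothetical
kernels, not taken from this patch's test suite): a uniform i1 produced by a
scalar compare lives in SCC and can be extended on the SALU, while a divergent
i1 lives in a lane mask and still needs V_CNDMASK_B32:

  define amdgpu_kernel void @zext_uniform(ptr addrspace(1) %out, i32 %a) {
    %c = icmp eq i32 %a, 0    ; uniform: %a is a scalar kernel argument
    %e = zext i1 %c to i32    ; now selects via S_CSELECT_B32
    store i32 %e, ptr addrspace(1) %out
    ret void
  }

  define amdgpu_kernel void @zext_divergent(ptr addrspace(1) %out) {
    %tid = call i32 @llvm.amdgcn.workitem.id.x()
    %c = icmp eq i32 %tid, 0  ; divergent: varies per lane
    %e = zext i1 %c to i32    ; still selects via V_CNDMASK_B32
    store i32 %e, ptr addrspace(1) %out
    ret void
  }

  declare i32 @llvm.amdgcn.workitem.id.x()

The UniformExt32 PatFrag keys on !N->isDivergent(), mirroring how the
existing sext_inreg patterns are split between uniform and divergent forms.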
---
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   28 +-
 llvm/test/CodeGen/AMDGPU/add.ll               |  122 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     |  152 +-
 llvm/test/CodeGen/AMDGPU/anyext.ll            |   22 +-
 .../atomic_optimizations_pixelshader.ll       |   82 +-
 ...der-no-live-segment-at-def-implicit-def.ll |   21 +-
 .../branch-folding-implicit-def-subreg.ll     |   18 +-
 llvm/test/CodeGen/AMDGPU/branch-relaxation.ll |  476 +-
 .../test/CodeGen/AMDGPU/carryout-selection.ll |  389 +-
 .../CodeGen/AMDGPU/combine-add-zext-xor.ll    |  228 +-
 .../CodeGen/AMDGPU/combine_andor_with_cmps.ll |   40 +-
 llvm/test/CodeGen/AMDGPU/ctpop16.ll           |   40 +-
 llvm/test/CodeGen/AMDGPU/ctpop64.ll           |   42 +-
 .../CodeGen/AMDGPU/dagcombine-fma-crash.ll    |   12 +-
 .../CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll |   22 +-
 .../AMDGPU/divergence-driven-trunc-to-i1.ll   |   24 +-
 ...cannot-create-empty-or-backward-segment.ll |  109 +-
 .../expand-scalar-carry-out-select-user.ll    |   46 +-
 .../CodeGen/AMDGPU/extract-subvector-16bit.ll |  360 +-
 .../CodeGen/AMDGPU/extract_vector_dynelt.ll   |   14 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  202 +-
 llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll  | 6704 ++++++++++-------
 .../CodeGen/AMDGPU/flat_atomics_i64_system.ll | 1878 +++--
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |   26 +-
 llvm/test/CodeGen/AMDGPU/fp-classify.ll       |  270 +-
 llvm/test/CodeGen/AMDGPU/fp_to_sint.ll        |   20 +-
 llvm/test/CodeGen/AMDGPU/fp_to_uint.ll        |   20 +-
 llvm/test/CodeGen/AMDGPU/fptosi.f16.ll        |   24 +-
 llvm/test/CodeGen/AMDGPU/fptoui.f16.ll        |   20 +-
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           |  286 +-
 llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll         |  556 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |   32 +-
 .../CodeGen/AMDGPU/indirect-addressing-si.ll  |  451 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |   70 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll |   37 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll  |   18 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll   |   18 +-
 .../AMDGPU/llvm.amdgcn.set.inactive.ll        |   19 +-
 llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll |  156 +-
 .../AMDGPU/long-branch-reserve-register.ll    |   51 +-
 .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll |   32 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |   68 +-
 llvm/test/CodeGen/AMDGPU/mul.ll               |  200 +-
 .../CodeGen/AMDGPU/optimize-negated-cond.ll   |  106 +-
 llvm/test/CodeGen/AMDGPU/or.ll                |   16 +-
 .../CodeGen/AMDGPU/private-memory-atomics.ll  |    8 +-
 llvm/test/CodeGen/AMDGPU/rsq.f64.ll           |   60 +-
 llvm/test/CodeGen/AMDGPU/saddo.ll             |  133 +-
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            |   52 +-
 llvm/test/CodeGen/AMDGPU/select-i32-zext.ll   |   42 +
 llvm/test/CodeGen/AMDGPU/select-undef.ll      |   36 +-
 llvm/test/CodeGen/AMDGPU/setcc-opt.ll         |   56 +-
 llvm/test/CodeGen/AMDGPU/setcc64.ll           |    4 +-
 llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll |   40 +-
 llvm/test/CodeGen/AMDGPU/sgpr-copy.ll         |   13 +-
 .../si-unify-exit-return-unreachable.ll       |  174 +-
 llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll     |   95 +-
 .../spill-vgpr-to-agpr-update-regscavenger.ll |   11 +-
 llvm/test/CodeGen/AMDGPU/srem.ll              |  408 +-
 llvm/test/CodeGen/AMDGPU/srem64.ll            |   66 +-
 .../CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll     |    9 +-
 .../test/CodeGen/AMDGPU/trunc-cmp-constant.ll |    8 +-
 .../AMDGPU/tuple-allocation-failure.ll        |  600 +-
 llvm/test/CodeGen/AMDGPU/uaddo.ll             |  183 +-
 llvm/test/CodeGen/AMDGPU/udiv64.ll            |   68 +-
 .../AMDGPU/undefined-subreg-liverange.ll      |   13 +-
 llvm/test/CodeGen/AMDGPU/uniform-cfg.ll       |    8 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll  |   60 +-
 llvm/test/CodeGen/AMDGPU/urem64.ll            |   50 +-
 llvm/test/CodeGen/AMDGPU/usubo.ll             |  183 +-
 llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll       |   16 +-
 .../CodeGen/AMDGPU/vector-alloca-bitcast.ll   |   44 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll            |   84 +-
 llvm/test/CodeGen/AMDGPU/wqm.ll               |   68 +-
 llvm/test/CodeGen/AMDGPU/xor.ll               |   40 +-
 llvm/test/CodeGen/AMDGPU/zero_extend.ll       |    8 +-
 .../InferAddressSpaces/AMDGPU/flat_atomic.ll  |   32 +-
 77 files changed, 9595 insertions(+), 6604 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/select-i32-zext.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c8a46217190a1d..4d0fdc50a37070 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2343,14 +2343,30 @@ def : GCNPat <
                      /*src1mod*/(i32 0), /*src1*/(i32 -1), i1:$src0)
 >;
 
-class Ext32Pat <SDNode ext> : GCNPat <
-  (i32 (ext i1:$src0)),
-  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
-                     /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0)
+
+class UniformExt32<SDNode ext> : PatFrag<
+  (ops node:$src),
+  (i32 (ext $src)),
+  [{ return !N->isDivergent(); }]>;
+
+class DivergentExt32<SDNode ext> : PatFrag<
+  (ops node:$src),
+  (i32 (ext $src))>;
+
+class UniformExt32Pat<SDNode ext> : GCNPat<
+  (UniformExt32<ext> SCC),
+  (S_CSELECT_B32 (i32 1), (i32 0))
 >;
 
-def : Ext32Pat <zext>;
-def : Ext32Pat <anyext>;
+class DivergentExt32Pat<SDNode ext> : GCNPat<
+  (DivergentExt32<ext> i1:$src),
+  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                     /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src)>;
+
+def : UniformExt32Pat<zext>;
+def : UniformExt32Pat<anyext>;
+def : DivergentExt32Pat<zext>;
+def : DivergentExt32Pat<anyext>;
 
 // The multiplication scales from [0,1) to the unsigned integer range,
 // rounding down a bit to avoid unwanted overflow.
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 3c9d43a88a0fda..96d16ae968e1a2 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -1156,15 +1156,22 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
 ; GFX6-NEXT:    s_and_b64 vcc, exec, s[10:11]
-; GFX6-NEXT:    s_cbranch_vccz .LBB9_4
+; GFX6-NEXT:    s_cbranch_vccz .LBB9_2
 ; GFX6-NEXT:  ; %bb.1: ; %else
 ; GFX6-NEXT:    s_add_u32 s4, s4, s6
 ; GFX6-NEXT:    s_addc_u32 s5, s5, s7
-; GFX6-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GFX6-NEXT:    s_cbranch_vccnz .LBB9_3
-; GFX6-NEXT:  .LBB9_2: ; %if
+; GFX6-NEXT:    s_branch .LBB9_3
+; GFX6-NEXT:  .LBB9_2:
+; GFX6-NEXT:    s_mov_b64 s[8:9], -1
+; GFX6-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX6-NEXT:  .LBB9_3: ; %Flow
+; GFX6-NEXT:    s_and_b64 s[6:7], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX6-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX6-NEXT:  ; %bb.4: ; %if
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX6-NEXT:  .LBB9_3: ; %endif
+; GFX6-NEXT:  .LBB9_5: ; %endif
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
@@ -1172,9 +1179,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-; GFX6-NEXT:  .LBB9_4:
-; GFX6-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GFX6-NEXT:    s_branch .LBB9_2
 ;
 ; GFX8-LABEL: add64_in_branch:
 ; GFX8:       ; %bb.0: ; %entry
@@ -1182,15 +1186,22 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX8-NEXT:    s_cbranch_scc0 .LBB9_4
+; GFX8-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX8-NEXT:  ; %bb.1: ; %else
 ; GFX8-NEXT:    s_add_u32 s4, s4, s6
 ; GFX8-NEXT:    s_addc_u32 s5, s5, s7
-; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GFX8-NEXT:    s_cbranch_vccnz .LBB9_3
-; GFX8-NEXT:  .LBB9_2: ; %if
+; GFX8-NEXT:    s_branch .LBB9_3
+; GFX8-NEXT:  .LBB9_2:
+; GFX8-NEXT:    s_mov_b64 s[8:9], -1
+; GFX8-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX8-NEXT:  .LBB9_3: ; %Flow
+; GFX8-NEXT:    s_and_b64 s[6:7], s[8:9], exec
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX8-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX8-NEXT:  ; %bb.4: ; %if
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX8-NEXT:  .LBB9_3: ; %endif
+; GFX8-NEXT:  .LBB9_5: ; %endif
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1198,9 +1209,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
-; GFX8-NEXT:  .LBB9_4:
-; GFX8-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GFX8-NEXT:    s_branch .LBB9_2
 ;
 ; GFX9-LABEL: add64_in_branch:
 ; GFX9:       ; %bb.0: ; %entry
@@ -1208,90 +1216,114 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT:    s_cbranch_scc0 .LBB9_4
+; GFX9-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX9-NEXT:  ; %bb.1: ; %else
 ; GFX9-NEXT:    s_add_u32 s0, s8, s10
 ; GFX9-NEXT:    s_addc_u32 s1, s9, s11
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GFX9-NEXT:    s_cbranch_vccnz .LBB9_3
-; GFX9-NEXT:  .LBB9_2: ; %if
+; GFX9-NEXT:    s_branch .LBB9_3
+; GFX9-NEXT:  .LBB9_2:
+; GFX9-NEXT:    s_mov_b64 s[2:3], -1
+; GFX9-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX9-NEXT:  .LBB9_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX9-NEXT:  ; %bb.4: ; %if
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:  .LBB9_3: ; %endif
+; GFX9-NEXT:  .LBB9_5: ; %endif
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-; GFX9-NEXT:  .LBB9_4:
-; GFX9-NEXT:    ; implicit-def: $sgpr0_sgpr1
-; GFX9-NEXT:    s_branch .LBB9_2
 ;
 ; GFX10-LABEL: add64_in_branch:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX10-NEXT:    s_cbranch_scc0 .LBB9_4
+; GFX10-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX10-NEXT:  ; %bb.1: ; %else
 ; GFX10-NEXT:    s_add_u32 s0, s8, s10
 ; GFX10-NEXT:    s_addc_u32 s1, s9, s11
-; GFX10-NEXT:    s_cbranch_execnz .LBB9_3
-; GFX10-NEXT:  .LBB9_2: ; %if
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    s_branch .LBB9_3
+; GFX10-NEXT:  .LBB9_2:
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX10-NEXT:  .LBB9_3: ; %Flow
+; GFX10-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX10-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX10-NEXT:  ; %bb.4: ; %if
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-NEXT:  .LBB9_3: ; %endif
+; GFX10-NEXT:  .LBB9_5: ; %endif
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
-; GFX10-NEXT:  .LBB9_4:
-; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
-; GFX10-NEXT:    s_branch .LBB9_2
 ;
 ; GFX11-LABEL: add64_in_branch:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB9_4
+; GFX11-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX11-NEXT:  ; %bb.1: ; %else
 ; GFX11-NEXT:    s_add_u32 s4, s4, s6
 ; GFX11-NEXT:    s_addc_u32 s5, s5, s7
-; GFX11-NEXT:    s_cbranch_execnz .LBB9_3
-; GFX11-NEXT:  .LBB9_2: ; %if
+; GFX11-NEXT:    s_mov_b32 s6, 0
+; GFX11-NEXT:    s_branch .LBB9_3
+; GFX11-NEXT:  .LBB9_2:
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX11-NEXT:  .LBB9_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX11-NEXT:  ; %bb.4: ; %if
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT:  .LBB9_3: ; %endif
+; GFX11-NEXT:  .LBB9_5: ; %endif
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
-; GFX11-NEXT:  .LBB9_4:
-; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GFX11-NEXT:    s_branch .LBB9_2
 ;
 ; GFX12-LABEL: add64_in_branch:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX12-NEXT:    s_cbranch_scc0 .LBB9_4
+; GFX12-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX12-NEXT:  ; %bb.1: ; %else
 ; GFX12-NEXT:    s_add_nc_u64 s[4:5], s[4:5], s[6:7]
-; GFX12-NEXT:    s_cbranch_execnz .LBB9_3
-; GFX12-NEXT:  .LBB9_2: ; %if
+; GFX12-NEXT:    s_mov_b32 s6, 0
+; GFX12-NEXT:    s_branch .LBB9_3
+; GFX12-NEXT:  .LBB9_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT:  .LBB9_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX12-NEXT:  ; %bb.4: ; %if
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
-; GFX12-NEXT:  .LBB9_3: ; %endif
+; GFX12-NEXT:  .LBB9_5: ; %endif
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB9_4:
-; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GFX12-NEXT:    s_branch .LBB9_2
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 4d26453e1a0d6d..4688c7a6879bd5 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -557,31 +557,31 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_mul_hi_u32 s9, s0, s7
 ; GFX908-NEXT:    s_mul_i32 s0, s0, s7
 ; GFX908-NEXT:    s_add_i32 s1, s9, s1
-; GFX908-NEXT:    s_lshl_b64 s[14:15], s[0:1], 5
+; GFX908-NEXT:    s_lshl_b64 s[0:1], s[0:1], 5
 ; GFX908-NEXT:    s_branch .LBB3_2
 ; GFX908-NEXT:  .LBB3_1: ; %Flow20
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GFX908-NEXT:    s_cbranch_vccz .LBB3_12
+; GFX908-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GFX908-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX908-NEXT:    s_cmp_lg_u32 s7, 1
+; GFX908-NEXT:    s_cbranch_scc0 .LBB3_14
 ; GFX908-NEXT:  .LBB3_2: ; %bb9
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
-; GFX908-NEXT:    ; Child Loop BB3_5 Depth 2
+; GFX908-NEXT:    ; Child Loop BB3_6 Depth 2
 ; GFX908-NEXT:    s_mov_b64 s[16:17], -1
-; GFX908-NEXT:    s_cbranch_scc0 .LBB3_10
+; GFX908-NEXT:    s_cbranch_scc0 .LBB3_12
 ; GFX908-NEXT:  ; %bb.3: ; %bb14
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX908-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
 ; GFX908-NEXT:    s_mov_b32 s9, s8
-; GFX908-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
 ; GFX908-NEXT:    v_mov_b32_e32 v4, s8
-; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v6
 ; GFX908-NEXT:    v_mov_b32_e32 v6, s8
 ; GFX908-NEXT:    v_mov_b32_e32 v8, s8
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX908-NEXT:    v_mov_b32_e32 v9, s9
-; GFX908-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
+; GFX908-NEXT:    v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
+; GFX908-NEXT:    v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
 ; GFX908-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX908-NEXT:    s_mov_b64 s[18:19], s[10:11]
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v4
@@ -596,18 +596,22 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_add_i32 s9, s20, s9
 ; GFX908-NEXT:    s_mul_i32 s7, s2, s7
 ; GFX908-NEXT:    s_add_i32 s9, s9, s21
-; GFX908-NEXT:    s_branch .LBB3_5
+; GFX908-NEXT:    s_branch .LBB3_6
 ; GFX908-NEXT:  .LBB3_4: ; %bb58
-; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
+; GFX908-NEXT:    ; in Loop: Header=BB3_6 Depth=2
 ; GFX908-NEXT:    v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT:    s_add_u32 s18, s18, s14
+; GFX908-NEXT:    s_add_u32 s18, s18, s0
 ; GFX908-NEXT:    v_cmp_lt_i64_e64 s[22:23], -1, v[2:3]
-; GFX908-NEXT:    s_addc_u32 s19, s19, s15
+; GFX908-NEXT:    s_addc_u32 s19, s19, s1
 ; GFX908-NEXT:    s_mov_b64 s[20:21], 0
-; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
-; GFX908-NEXT:    s_cbranch_vccz .LBB3_9
-; GFX908-NEXT:  .LBB3_5: ; %bb16
+; GFX908-NEXT:  .LBB3_5: ; %Flow18
+; GFX908-NEXT:    ; in Loop: Header=BB3_6 Depth=2
+; GFX908-NEXT:    s_and_b64 s[22:23], s[22:23], exec
+; GFX908-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX908-NEXT:    s_cmp_lg_u32 s22, 1
+; GFX908-NEXT:    s_cbranch_scc0 .LBB3_11
+; GFX908-NEXT:  .LBB3_6: ; %bb16
 ; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    s_add_u32 s20, s18, s7
@@ -622,11 +626,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    ds_read_b64 v[12:13], v19
 ; GFX908-NEXT:    ds_read_b64 v[14:15], v0
-; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX908-NEXT:    s_and_b64 s[20:21], s[16:17], exec
+; GFX908-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX908-NEXT:    s_cmp_lg_u32 s20, 1
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_cbranch_vccnz .LBB3_7
-; GFX908-NEXT:  ; %bb.6: ; %bb51
-; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
+; GFX908-NEXT:    s_cbranch_scc1 .LBB3_8
+; GFX908-NEXT:  ; %bb.7: ; %bb51
+; GFX908-NEXT:    ; in Loop: Header=BB3_6 Depth=2
 ; GFX908-NEXT:    v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX908-NEXT:    v_cvt_f32_f16_e32 v21, v21
 ; GFX908-NEXT:    v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -648,31 +654,37 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_add_f32_e32 v10, v10, v12
 ; GFX908-NEXT:    v_add_f32_e32 v11, v11, v13
 ; GFX908-NEXT:    s_mov_b64 s[20:21], -1
-; GFX908-NEXT:    s_branch .LBB3_4
-; GFX908-NEXT:  .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT:    s_mov_b64 s[20:21], s[16:17]
-; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
-; GFX908-NEXT:    s_cbranch_vccz .LBB3_4
-; GFX908-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
+; GFX908-NEXT:    s_branch .LBB3_9
+; GFX908-NEXT:  .LBB3_8: ; in Loop: Header=BB3_6 Depth=2
+; GFX908-NEXT:    s_mov_b64 s[20:21], s[14:15]
+; GFX908-NEXT:  .LBB3_9: ; %Flow
+; GFX908-NEXT:    ; in Loop: Header=BB3_6 Depth=2
+; GFX908-NEXT:    s_and_b64 s[22:23], s[20:21], exec
+; GFX908-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX908-NEXT:    s_cmp_lg_u32 s22, 1
+; GFX908-NEXT:    s_cbranch_scc0 .LBB3_4
+; GFX908-NEXT:  ; %bb.10: ; in Loop: Header=BB3_6 Depth=2
+; GFX908-NEXT:    s_mov_b64 s[22:23], -1
 ; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX908-NEXT:    ; implicit-def: $sgpr18_sgpr19
-; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
+; GFX908-NEXT:    s_branch .LBB3_5
+; GFX908-NEXT:  .LBB3_11: ; %loop.exit.guard
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_xor_b64 s[16:17], s[20:21], -1
-; GFX908-NEXT:  .LBB3_10: ; %Flow19
+; GFX908-NEXT:  .LBB3_12: ; %Flow19
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_mov_b64 s[0:1], -1
+; GFX908-NEXT:    s_mov_b64 s[14:15], -1
 ; GFX908-NEXT:    s_and_b64 vcc, exec, s[16:17]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
-; GFX908-NEXT:  ; %bb.11: ; %bb12
+; GFX908-NEXT:  ; %bb.13: ; %bb12
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    s_add_u32 s4, s4, s6
 ; GFX908-NEXT:    s_addc_u32 s5, s5, 0
 ; GFX908-NEXT:    s_add_u32 s10, s10, s12
 ; GFX908-NEXT:    s_addc_u32 s11, s11, s13
-; GFX908-NEXT:    s_mov_b64 s[0:1], 0
+; GFX908-NEXT:    s_mov_b64 s[14:15], 0
 ; GFX908-NEXT:    s_branch .LBB3_1
-; GFX908-NEXT:  .LBB3_12: ; %DummyReturnBlock
+; GFX908-NEXT:  .LBB3_14: ; %DummyReturnBlock
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: introduced_copy_to_sgpr:
@@ -720,28 +732,28 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_mul_hi_u32 s9, s0, s7
 ; GFX90A-NEXT:    s_mul_i32 s0, s0, s7
 ; GFX90A-NEXT:    s_add_i32 s1, s9, s1
-; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[0:1], 5
+; GFX90A-NEXT:    s_lshl_b64 s[0:1], s[0:1], 5
 ; GFX90A-NEXT:    s_branch .LBB3_2
 ; GFX90A-NEXT:  .LBB3_1: ; %Flow20
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GFX90A-NEXT:    s_cbranch_vccz .LBB3_12
+; GFX90A-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GFX90A-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX90A-NEXT:    s_cmp_lg_u32 s7, 1
+; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_14
 ; GFX90A-NEXT:  .LBB3_2: ; %bb9
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB3_5 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB3_6 Depth 2
 ; GFX90A-NEXT:    s_mov_b64 s[16:17], -1
-; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_10
+; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_12
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
 ; GFX90A-NEXT:    s_mov_b32 s9, s8
-; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v8
 ; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
+; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
+; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
 ; GFX90A-NEXT:    s_mov_b64 s[18:19], s[10:11]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
@@ -755,18 +767,22 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_add_i32 s9, s20, s9
 ; GFX90A-NEXT:    s_mul_i32 s7, s2, s7
 ; GFX90A-NEXT:    s_add_i32 s9, s9, s21
-; GFX90A-NEXT:    s_branch .LBB3_5
+; GFX90A-NEXT:    s_branch .LBB3_6
 ; GFX90A-NEXT:  .LBB3_4: ; %bb58
-; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
+; GFX90A-NEXT:    ; in Loop: Header=BB3_6 Depth=2
 ; GFX90A-NEXT:    v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    s_add_u32 s18, s18, s14
-; GFX90A-NEXT:    s_addc_u32 s19, s19, s15
+; GFX90A-NEXT:    s_add_u32 s18, s18, s0
+; GFX90A-NEXT:    s_addc_u32 s19, s19, s1
 ; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[22:23], -1, v[4:5]
 ; GFX90A-NEXT:    s_mov_b64 s[20:21], 0
-; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
-; GFX90A-NEXT:    s_cbranch_vccz .LBB3_9
-; GFX90A-NEXT:  .LBB3_5: ; %bb16
+; GFX90A-NEXT:  .LBB3_5: ; %Flow18
+; GFX90A-NEXT:    ; in Loop: Header=BB3_6 Depth=2
+; GFX90A-NEXT:    s_and_b64 s[22:23], s[22:23], exec
+; GFX90A-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX90A-NEXT:    s_cmp_lg_u32 s22, 1
+; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_11
+; GFX90A-NEXT:  .LBB3_6: ; %bb16
 ; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    s_add_u32 s20, s18, s7
@@ -781,12 +797,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ds_read_b64 v[14:15], v19
 ; GFX90A-NEXT:    ds_read_b64 v[16:17], v0
-; GFX90A-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GFX90A-NEXT:    ; kill: killed $sgpr20 killed $sgpr21
+; GFX90A-NEXT:    s_and_b64 s[20:21], s[16:17], exec
+; GFX90A-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX90A-NEXT:    s_cmp_lg_u32 s20, 1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_cbranch_vccnz .LBB3_7
-; GFX90A-NEXT:  ; %bb.6: ; %bb51
-; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
+; GFX90A-NEXT:    s_cbranch_scc1 .LBB3_8
+; GFX90A-NEXT:  ; %bb.7: ; %bb51
+; GFX90A-NEXT:    ; in Loop: Header=BB3_6 Depth=2
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v22, v21
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -800,31 +818,37 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[16:17]
 ; GFX90A-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[14:15]
 ; GFX90A-NEXT:    s_mov_b64 s[20:21], -1
-; GFX90A-NEXT:    s_branch .LBB3_4
-; GFX90A-NEXT:  .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX90A-NEXT:    s_mov_b64 s[20:21], s[16:17]
-; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
-; GFX90A-NEXT:    s_cbranch_vccz .LBB3_4
-; GFX90A-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
+; GFX90A-NEXT:    s_branch .LBB3_9
+; GFX90A-NEXT:  .LBB3_8: ; in Loop: Header=BB3_6 Depth=2
+; GFX90A-NEXT:    s_mov_b64 s[20:21], s[14:15]
+; GFX90A-NEXT:  .LBB3_9: ; %Flow
+; GFX90A-NEXT:    ; in Loop: Header=BB3_6 Depth=2
+; GFX90A-NEXT:    s_and_b64 s[22:23], s[20:21], exec
+; GFX90A-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX90A-NEXT:    s_cmp_lg_u32 s22, 1
+; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_4
+; GFX90A-NEXT:  ; %bb.10: ; in Loop: Header=BB3_6 Depth=2
+; GFX90A-NEXT:    s_mov_b64 s[22:23], -1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX90A-NEXT:    ; implicit-def: $sgpr18_sgpr19
-; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
+; GFX90A-NEXT:    s_branch .LBB3_5
+; GFX90A-NEXT:  .LBB3_11: ; %loop.exit.guard
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_xor_b64 s[16:17], s[20:21], -1
-; GFX90A-NEXT:  .LBB3_10: ; %Flow19
+; GFX90A-NEXT:  .LBB3_12: ; %Flow19
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_mov_b64 s[0:1], -1
+; GFX90A-NEXT:    s_mov_b64 s[14:15], -1
 ; GFX90A-NEXT:    s_and_b64 vcc, exec, s[16:17]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_1
-; GFX90A-NEXT:  ; %bb.11: ; %bb12
+; GFX90A-NEXT:  ; %bb.13: ; %bb12
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    s_add_u32 s4, s4, s6
 ; GFX90A-NEXT:    s_addc_u32 s5, s5, 0
 ; GFX90A-NEXT:    s_add_u32 s10, s10, s12
 ; GFX90A-NEXT:    s_addc_u32 s11, s11, s13
-; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
+; GFX90A-NEXT:    s_mov_b64 s[14:15], 0
 ; GFX90A-NEXT:    s_branch .LBB3_1
-; GFX90A-NEXT:  .LBB3_12: ; %DummyReturnBlock
+; GFX90A-NEXT:  .LBB3_14: ; %DummyReturnBlock
 ; GFX90A-NEXT:    s_endpgm
 bb:
   %i = load volatile i16, ptr addrspace(4) undef, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 8b6c8be9f37882..67da40c27d8aaa 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -15,8 +15,8 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -28,10 +28,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GFX8-NEXT:    s_mov_b32 s2, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT:    v_not_b32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_andn2_b32 s4, 1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -43,10 +42,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_andn2_b32 s4, 1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
@@ -155,7 +153,9 @@ define amdgpu_kernel void @anyext_v2i16_to_v2i32() #0 {
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x8001, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GCN-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 4ae08a0375c8c3..cc3ce805654fee 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -42,8 +42,10 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
 ; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GFX7-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX7-NEXT:    s_cbranch_scc1 .LBB0_6
 ; GFX7-NEXT:  ; %bb.5: ; %if
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
@@ -77,8 +79,10 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX89-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX89-NEXT:    s_wqm_b64 s[4:5], -1
 ; GFX89-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX89-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GFX89-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX89-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX89-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX89-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX89-NEXT:    s_cbranch_scc1 .LBB0_6
 ; GFX89-NEXT:  ; %bb.5: ; %if
 ; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX89-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
@@ -113,8 +117,10 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
 ; GFX1064-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GFX1064-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX1064-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX1064-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1064-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB0_6
 ; GFX1064-NEXT:  ; %bb.5: ; %if
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
@@ -148,8 +154,10 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1032-NEXT:    s_wqm_b32 s4, -1
 ; GFX1032-NEXT:    s_and_b32 s4, s4, s4
-; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX1032-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1032-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1032-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB0_6
 ; GFX1032-NEXT:  ; %bb.5: ; %if
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
@@ -188,8 +196,11 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1164-NEXT:    s_wqm_b64 s[4:5], -1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX1164-NEXT:    s_and_not1_b64 vcc, exec, s[4:5]
-; GFX1164-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX1164-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX1164-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB0_6
 ; GFX1164-NEXT:  ; %bb.5: ; %if
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
@@ -227,8 +238,11 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1132-NEXT:    s_wqm_b32 s4, -1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_b32 s4, s4, s4
-; GFX1132-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_vccnz .LBB0_6
+; GFX1132-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1132-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB0_6
 ; GFX1132-NEXT:  ; %bb.5: ; %if
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
@@ -252,8 +266,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
 ; GFX7-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
-; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GFX7-NEXT:    s_cbranch_vccnz .LBB1_2
+; GFX7-NEXT:    s_and_b64 s[4:5], s[8:9], exec
+; GFX7-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX7-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -308,8 +324,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
 ; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GFX8-NEXT:    s_cbranch_vccnz .LBB1_6
+; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX8-NEXT:    s_cbranch_scc1 .LBB1_6
 ; GFX8-NEXT:  ; %bb.5: ; %if
 ; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], 0
 ; GFX8-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
@@ -363,8 +381,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_vccnz .LBB1_6
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB1_6
 ; GFX9-NEXT:  ; %bb.5: ; %if
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], 0
 ; GFX9-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
@@ -423,8 +443,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
 ; GFX1064-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GFX1064-NEXT:    s_cbranch_vccnz .LBB1_6
+; GFX1064-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX1064-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1064-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB1_6
 ; GFX1064-NEXT:  ; %bb.5: ; %if
 ; GFX1064-NEXT:    buffer_store_dword v4, off, s[0:3], 0
 ; GFX1064-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
@@ -473,8 +495,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
 ; GFX1032-NEXT:    s_wqm_b32 s4, -1
 ; GFX1032-NEXT:    s_and_b32 s4, s4, s4
-; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
-; GFX1032-NEXT:    s_cbranch_vccnz .LBB1_6
+; GFX1032-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1032-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1032-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB1_6
 ; GFX1032-NEXT:  ; %bb.5: ; %if
 ; GFX1032-NEXT:    buffer_store_dword v4, off, s[0:3], 0
 ; GFX1032-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
@@ -543,8 +567,11 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1164-NEXT:    s_wqm_b64 s[4:5], -1
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_and_b64 s[4:5], s[4:5], s[4:5]
-; GFX1164-NEXT:    s_and_not1_b64 vcc, exec, s[4:5]
-; GFX1164-NEXT:    s_cbranch_vccnz .LBB1_6
+; GFX1164-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX1164-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB1_6
 ; GFX1164-NEXT:  ; %bb.5: ; %if
 ; GFX1164-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
 ; GFX1164-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
@@ -601,8 +628,11 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1132-NEXT:    s_wqm_b32 s4, -1
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_and_b32 s4, s4, s4
-; GFX1132-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX1132-NEXT:    s_cbranch_vccnz .LBB1_6
+; GFX1132-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1132-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB1_6
 ; GFX1132-NEXT:  ; %bb.5: ; %if
 ; GFX1132-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
 ; GFX1132-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index 6dfc832ff3ac9f..af00f7d42018bd 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -16,7 +16,7 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s40, 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB0_8
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_9
 ; CHECK-NEXT:  ; %bb.1: ; %if.end13.i.i
 ; CHECK-NEXT:    s_cmp_eq_u32 s42, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_4
@@ -28,16 +28,14 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_5
 ; CHECK-NEXT:  ; %bb.3:
 ; CHECK-NEXT:    s_mov_b32 s36, 0
-; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_6
-; CHECK-NEXT:    s_branch .LBB0_7
+; CHECK-NEXT:    s_branch .LBB0_6
 ; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    s_mov_b64 s[38:39], s[10:11]
 ; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_branch .LBB0_7
+; CHECK-NEXT:    s_branch .LBB0_8
 ; CHECK-NEXT:  .LBB0_5: ; %if.then263.i.i
 ; CHECK-NEXT:    v_cmp_lt_f32_e64 s8, s41, 0
 ; CHECK-NEXT:    s_mov_b32 s36, 1.0
@@ -45,9 +43,12 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_mov_b32 s37, s36
 ; CHECK-NEXT:    s_mov_b32 s38, s36
 ; CHECK-NEXT:    s_mov_b32 s39, s36
-; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_7
-; CHECK-NEXT:  .LBB0_6: ; %if.end273.i.i
+; CHECK-NEXT:  .LBB0_6: ; %Flow
+; CHECK-NEXT:    s_and_b32 s8, s8, exec_lo
+; CHECK-NEXT:    s_cselect_b32 s8, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s8, 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_8
+; CHECK-NEXT:  ; %bb.7: ; %if.end273.i.i
 ; CHECK-NEXT:    s_add_u32 s8, s6, 40
 ; CHECK-NEXT:    s_addc_u32 s9, s7, 0
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
@@ -69,13 +70,13 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_mov_b32 s37, s36
 ; CHECK-NEXT:    s_mov_b32 s38, s36
 ; CHECK-NEXT:    s_mov_b32 s39, s36
-; CHECK-NEXT:  .LBB0_7: ; %if.end294.i.i
+; CHECK-NEXT:  .LBB0_8: ; %if.end294.i.i
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; CHECK-NEXT:  .LBB0_8: ; %kernel_direct_lighting.exit
+; CHECK-NEXT:  .LBB0_9: ; %kernel_direct_lighting.exit
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x20
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s36
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 055e9850de3d68..b732b725690300 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -318,8 +318,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.31, implicit $vcc
+  ; GFX90A-NEXT:   dead renamable $sgpr4_sgpr5 = S_AND_B64 killed renamable $sgpr34_sgpr35, $exec, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr4 = S_CSELECT_B32 1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   S_CMP_LG_U32 killed renamable $sgpr4, 1, implicit-def $scc
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.31, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.30.bb19:
   ; GFX90A-NEXT:   successors: %bb.31(0x80000000)
@@ -911,8 +913,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.65(0x40000000), %bb.67(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.67, implicit $vcc
+  ; GFX90A-NEXT:   dead renamable $sgpr8_sgpr9 = S_AND_B64 killed renamable $sgpr28_sgpr29, $exec, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr8 = S_CSELECT_B32 1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   S_CMP_LG_U32 killed renamable $sgpr8, 1, implicit-def $scc
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.67, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.65.bb159:
   ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.66(0x40000000)
@@ -976,8 +980,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   successors: %bb.71(0x40000000), %bb.72(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
+  ; GFX90A-NEXT:   dead renamable $sgpr8_sgpr9 = S_AND_B64 killed renamable $sgpr8_sgpr9, $exec, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr8 = S_CSELECT_B32 1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   S_CMP_LG_U32 killed renamable $sgpr8, 1, implicit-def $scc
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.72, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.71.bb186:
   ; GFX90A-NEXT:   successors: %bb.72(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 77f1bc2a172a50..ff9c7a796c1626 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -577,23 +577,33 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
 ; GCN-NEXT:    s_mov_b64 s[0:1], -1
-; GCN-NEXT:    s_cbranch_scc0 .LBB5_1
-; GCN-NEXT:  ; %bb.7: ; %bb0
-; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:  .Lpost_getpc5:
-; GCN-NEXT:    s_add_u32 s0, s0, (.LBB5_4-.Lpost_getpc5)&4294967295
-; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB5_4-.Lpost_getpc5)>>32
-; GCN-NEXT:    s_setpc_b64 s[0:1]
-; GCN-NEXT:  .LBB5_1: ; %Flow
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN-NEXT:    s_cbranch_vccnz .LBB5_3
-; GCN-NEXT:  .LBB5_2: ; %bb2
+; GCN-NEXT:    s_cbranch_scc1 .LBB5_1
+; GCN-NEXT:  ; %bb.5: ; %bb0
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:  .Lpost_getpc4:
+; GCN-NEXT:    s_add_u32 s4, s4, (.LBB5_2-.Lpost_getpc4)&4294967295
+; GCN-NEXT:    s_addc_u32 s5, s5, (.LBB5_2-.Lpost_getpc4)>>32
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+; GCN-NEXT:  .LBB5_1: ; %bb3
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_mov_b64 s[0:1], 0
+; GCN-NEXT:  .LBB5_2: ; %Flow
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB5_4
+; GCN-NEXT:  ; %bb.3: ; %bb2
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 17
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:  .LBB5_3: ; %bb4
+; GCN-NEXT:  .LBB5_4: ; %bb4
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
@@ -603,26 +613,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_endpgm
-; GCN-NEXT:  .LBB5_4: ; %bb3
-; GCN-NEXT:    ;;#ASMSTART
-; GCN-NEXT:    v_nop_e64
-; GCN-NEXT:    v_nop_e64
-; GCN-NEXT:    v_nop_e64
-; GCN-NEXT:    v_nop_e64
-; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_cbranch_execnz .LBB5_5
-; GCN-NEXT:  ; %bb.9: ; %bb3
-; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:  .Lpost_getpc6:
-; GCN-NEXT:    s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc6)&4294967295
-; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc6)>>32
-; GCN-NEXT:    s_setpc_b64 s[0:1]
-; GCN-NEXT:  .LBB5_5: ; %bb3
-; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:  .Lpost_getpc4:
-; GCN-NEXT:    s_add_u32 s0, s0, (.LBB5_3-.Lpost_getpc4)&4294967295
-; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB5_3-.Lpost_getpc4)>>32
-; GCN-NEXT:    s_setpc_b64 s[0:1]
 ;
 ; GFX11-LABEL: uniform_unconditional_min_long_forward_branch:
 ; GFX11:       ; %bb.0: ; %bb0
@@ -630,15 +620,34 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], -1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX11-NEXT:  ; %bb.5: ; %bb0
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:  .Lpost_getpc5:
+; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
+; GFX11-NEXT:    s_add_u32 s4, s4, (.LBB5_2-.Lpost_getpc5)&4294967295
+; GFX11-NEXT:    s_addc_u32 s5, s5, (.LBB5_2-.Lpost_getpc5)>>32
+; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
+; GFX11-NEXT:    s_setpc_b64 s[4:5]
+; GFX11-NEXT:  .LBB5_1: ; %bb3
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    v_nop_e64
+; GFX11-NEXT:    v_nop_e64
+; GFX11-NEXT:    v_nop_e64
+; GFX11-NEXT:    v_nop_e64
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:  .LBB5_2: ; %Flow
+; GFX11-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[0:1]
-; GFX11-NEXT:    s_cbranch_vccnz .LBB5_3
-; GFX11-NEXT:  .LBB5_2: ; %bb2
+; GFX11-NEXT:  ; %bb.3: ; %bb2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 17
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:  .LBB5_3: ; %bb4
+; GFX11-NEXT:  .LBB5_4: ; %bb4
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 63
@@ -646,30 +655,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_endpgm
-; GFX11-NEXT:  .LBB5_4: ; %bb3
-; GFX11-NEXT:    ;;#ASMSTART
-; GFX11-NEXT:    v_nop_e64
-; GFX11-NEXT:    v_nop_e64
-; GFX11-NEXT:    v_nop_e64
-; GFX11-NEXT:    v_nop_e64
-; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_cbranch_execnz .LBB5_5
-; GFX11-NEXT:  ; %bb.7: ; %bb3
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc6:
-; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc6)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc6)>>32
-; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_setpc_b64 s[0:1]
-; GFX11-NEXT:  .LBB5_5: ; %bb3
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc5:
-; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB5_3-.Lpost_getpc5)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB5_3-.Lpost_getpc5)>>32
-; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ;
 ; GFX12-LABEL: uniform_unconditional_min_long_forward_branch:
 ; GFX12:       ; %bb.0: ; %bb0
@@ -677,53 +662,40 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s0, 0
 ; GFX12-NEXT:    s_mov_b32 s0, -1
-; GFX12-NEXT:    s_cbranch_scc0 .LBB5_1
-; GFX12-NEXT:  ; %bb.7: ; %bb0
-; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:  .Lpost_getpc6:
+; GFX12-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX12-NEXT:  ; %bb.5: ; %bb0
+; GFX12-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-NEXT:  .Lpost_getpc5:
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB5_4-.Lpost_getpc6)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB5_4-.Lpost_getpc6)>>32
+; GFX12-NEXT:    s_add_co_u32 s4, s4, (.LBB5_2-.Lpost_getpc5)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, (.LBB5_2-.Lpost_getpc5)>>32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_setpc_b64 s[0:1]
-; GFX12-NEXT:  .LBB5_1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_vccnz .LBB5_3
-; GFX12-NEXT:  .LBB5_2: ; %bb2
+; GFX12-NEXT:    s_setpc_b64 s[4:5]
+; GFX12-NEXT:  .LBB5_1: ; %bb3
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    ;;#ASMSTART
+; GFX12-NEXT:    v_nop_e64
+; GFX12-NEXT:    v_nop_e64
+; GFX12-NEXT:    v_nop_e64
+; GFX12-NEXT:    v_nop_e64
+; GFX12-NEXT:    ;;#ASMEND
+; GFX12-NEXT:  .LBB5_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX12-NEXT:  ; %bb.3: ; %bb2
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 17
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:  .LBB5_3: ; %bb4
+; GFX12-NEXT:  .LBB5_4: ; %bb4
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB5_4: ; %bb3
-; GFX12-NEXT:    ;;#ASMSTART
-; GFX12-NEXT:    v_nop_e64
-; GFX12-NEXT:    v_nop_e64
-; GFX12-NEXT:    v_nop_e64
-; GFX12-NEXT:    v_nop_e64
-; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    s_cbranch_execnz .LBB5_5
-; GFX12-NEXT:  ; %bb.9: ; %bb3
-; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:  .Lpost_getpc7:
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB5_2-.Lpost_getpc7)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB5_2-.Lpost_getpc7)>>32
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_setpc_b64 s[0:1]
-; GFX12-NEXT:  .LBB5_5: ; %bb3
-; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:  .Lpost_getpc5:
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB5_3-.Lpost_getpc5)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB5_3-.Lpost_getpc5)>>32
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_setpc_b64 s[0:1]
 bb0:
   %tmp = icmp ne i32 %arg1, 0
   br i1 %tmp, label %bb2, label %bb3
@@ -763,9 +735,9 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad
 ; GCN-NEXT:  ; %bb.3: ; %loop
 ; GCN-NEXT:    ; in Loop: Header=BB6_1 Depth=1
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:  .Lpost_getpc7:
-; GCN-NEXT:    s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc7)&4294967295
-; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc7)>>32
+; GCN-NEXT:  .Lpost_getpc5:
+; GCN-NEXT:    s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc5)&4294967295
+; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc5)>>32
 ; GCN-NEXT:    s_setpc_b64 s[0:1]
 ; GCN-NEXT:  .LBB6_2: ; %DummyReturnBlock
 ; GCN-NEXT:    s_endpgm
@@ -786,10 +758,10 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad
 ; GFX11-NEXT:  ; %bb.3: ; %loop
 ; GFX11-NEXT:    ; in Loop: Header=BB6_1 Depth=1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc7:
+; GFX11-NEXT:  .Lpost_getpc6:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc7)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc7)>>32
+; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc6)&4294967295
+; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc6)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ; GFX11-NEXT:  .LBB6_2: ; %DummyReturnBlock
@@ -810,10 +782,10 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad
 ; GFX12-NEXT:  ; %bb.3: ; %loop
 ; GFX12-NEXT:    ; in Loop: Header=BB6_1 Depth=1
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:  .Lpost_getpc8:
+; GFX12-NEXT:  .Lpost_getpc6:
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB6_1-.Lpost_getpc8)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB6_1-.Lpost_getpc8)>>32
+; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB6_1-.Lpost_getpc6)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB6_1-.Lpost_getpc6)>>32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[0:1]
 ; GFX12-NEXT:  .LBB6_2: ; %DummyReturnBlock
@@ -849,13 +821,15 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 3
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:  .LBB7_2: ; %Flow
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN-NEXT:    s_cbranch_vccz .LBB7_3
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc0 .LBB7_3
 ; GCN-NEXT:  ; %bb.5: ; %Flow
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:  .Lpost_getpc8:
-; GCN-NEXT:    s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc8)&4294967295
-; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc8)>>32
+; GCN-NEXT:  .Lpost_getpc6:
+; GCN-NEXT:    s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc6)&4294967295
+; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc6)>>32
 ; GCN-NEXT:    s_setpc_b64 s[0:1]
 ; GCN-NEXT:  .LBB7_3: ; %bb2
 ; GCN-NEXT:    ;;#ASMSTART
@@ -888,15 +862,17 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 ; GFX11-NEXT:    s_cmp_lg_u32 s0, 3
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX11-NEXT:  .LBB7_2: ; %Flow
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[0:1]
-; GFX11-NEXT:    s_cbranch_vccz .LBB7_3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc0 .LBB7_3
 ; GFX11-NEXT:  ; %bb.5: ; %Flow
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc8:
+; GFX11-NEXT:  .Lpost_getpc7:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc8)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc8)>>32
+; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc7)&4294967295
+; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc7)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ; GFX11-NEXT:  .LBB7_3: ; %bb2
@@ -930,15 +906,17 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 ; GFX12-NEXT:    s_cmp_lg_u32 s0, 3
 ; GFX12-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX12-NEXT:  .LBB7_2: ; %Flow
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_vccz .LBB7_3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB7_3
 ; GFX12-NEXT:  ; %bb.5: ; %Flow
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:  .Lpost_getpc9:
+; GFX12-NEXT:  .Lpost_getpc7:
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB7_4-.Lpost_getpc9)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB7_4-.Lpost_getpc9)>>32
+; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB7_4-.Lpost_getpc7)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB7_4-.Lpost_getpc7)>>32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[0:1]
 ; GFX12-NEXT:  .LBB7_3: ; %bb2
@@ -994,9 +972,9 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %
 ; GCN-NEXT:    s_cbranch_execnz .LBB8_1
 ; GCN-NEXT:  ; %bb.4: ; %entry
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:  .Lpost_getpc9:
-; GCN-NEXT:    s_add_u32 s0, s0, (.LBB8_3-.Lpost_getpc9)&4294967295
-; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc9)>>32
+; GCN-NEXT:  .Lpost_getpc7:
+; GCN-NEXT:    s_add_u32 s0, s0, (.LBB8_3-.Lpost_getpc7)&4294967295
+; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc7)>>32
 ; GCN-NEXT:    s_setpc_b64 s[0:1]
 ; GCN-NEXT:  .LBB8_1: ; %if
 ; GCN-NEXT:    s_load_dword s6, s[2:3], 0xb
@@ -1106,9 +1084,9 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GCN-NEXT:    s_cbranch_execnz .LBB9_3
 ; GCN-NEXT:  ; %bb.6: ; %Flow1
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:  .Lpost_getpc10:
-; GCN-NEXT:    s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295
-; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32
+; GCN-NEXT:  .Lpost_getpc8:
+; GCN-NEXT:    s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc8)&4294967295
+; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc8)>>32
 ; GCN-NEXT:    s_setpc_b64 s[0:1]
 ; GCN-NEXT:  .LBB9_3: ; %loop.preheader
 ; GCN-NEXT:    s_and_b64 vcc, exec, 0
@@ -1129,9 +1107,9 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GCN-NEXT:  ; %bb.8: ; %loop
 ; GCN-NEXT:    ; in Loop: Header=BB9_4 Depth=1
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:  .Lpost_getpc11:
-; GCN-NEXT:    s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295
-; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32
+; GCN-NEXT:  .Lpost_getpc9:
+; GCN-NEXT:    s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc9)&4294967295
+; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc9)>>32
 ; GCN-NEXT:    s_setpc_b64 s[0:1]
 ; GCN-NEXT:  .LBB9_5: ; %UnifiedReturnBlock
 ; GCN-NEXT:    s_endpgm
@@ -1154,10 +1132,10 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GFX11-NEXT:    s_cbranch_execnz .LBB9_3
 ; GFX11-NEXT:  ; %bb.6: ; %Flow1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc9:
+; GFX11-NEXT:  .Lpost_getpc8:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc9)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc9)>>32
+; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc8)&4294967295
+; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc8)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ; GFX11-NEXT:  .LBB9_3: ; %loop.preheader
@@ -1179,10 +1157,10 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GFX11-NEXT:  ; %bb.8: ; %loop
 ; GFX11-NEXT:    ; in Loop: Header=BB9_4 Depth=1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:  .Lpost_getpc10:
+; GFX11-NEXT:  .Lpost_getpc9:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc10)&4294967295
-; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc10)>>32
+; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc9)&4294967295
+; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc9)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[0:1]
 ; GFX11-NEXT:  .LBB9_5: ; %UnifiedReturnBlock
@@ -1206,10 +1184,10 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GFX12-NEXT:    s_cbranch_execnz .LBB9_3
 ; GFX12-NEXT:  ; %bb.6: ; %Flow1
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:  .Lpost_getpc10:
+; GFX12-NEXT:  .Lpost_getpc8:
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32
+; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB9_5-.Lpost_getpc8)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB9_5-.Lpost_getpc8)>>32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[0:1]
 ; GFX12-NEXT:  .LBB9_3: ; %loop.preheader
@@ -1230,10 +1208,10 @@ define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GFX12-NEXT:  ; %bb.8: ; %loop
 ; GFX12-NEXT:    ; in Loop: Header=BB9_4 Depth=1
 ; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:  .Lpost_getpc11:
+; GFX12-NEXT:  .Lpost_getpc9:
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32
+; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB9_4-.Lpost_getpc9)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB9_4-.Lpost_getpc9)>>32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[0:1]
 ; GFX12-NEXT:  .LBB9_5: ; %UnifiedReturnBlock
@@ -1277,9 +1255,9 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GCN-NEXT:  ; %bb.8: ; %bb
 ; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:  .Lpost_getpc12:
-; GCN-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295
-; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32
+; GCN-NEXT:  .Lpost_getpc10:
+; GCN-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc10)&4294967295
+; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc10)>>32
 ; GCN-NEXT:    s_setpc_b64 s[8:9]
 ; GCN-NEXT:  .LBB10_1: ; %bb13
 ; GCN-NEXT:    ;;#ASMSTART
@@ -1288,46 +1266,54 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GCN-NEXT:    v_nop_e64
 ; GCN-NEXT:    v_nop_e64
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_cbranch_execz .LBB10_3
-; GCN-NEXT:    s_branch .LBB10_4
+; GCN-NEXT:    s_mov_b64 s[10:11], 0
+; GCN-NEXT:    s_branch .LBB10_3
 ; GCN-NEXT:  .LBB10_2:
+; GCN-NEXT:    s_mov_b64 s[10:11], -1
 ; GCN-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-NEXT:  .LBB10_3: ; %bb9
+; GCN-NEXT:  .LBB10_3: ; %Flow
+; GCN-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB10_5
+; GCN-NEXT:  ; %bb.4: ; %bb9
 ; GCN-NEXT:    s_cmp_lt_i32 s7, 11
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-NEXT:    s_cmp_ge_i32 s6, s7
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
-; GCN-NEXT:  .LBB10_4: ; %Flow5
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-NEXT:    s_cbranch_vccz .LBB10_5
+; GCN-NEXT:  .LBB10_5: ; %Flow5
+; GCN-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN-NEXT:    ; implicit-def: $sgpr4
+; GCN-NEXT:    s_cbranch_scc0 .LBB10_6
 ; GCN-NEXT:  ; %bb.10: ; %Flow5
 ; GCN-NEXT:    s_getpc_b64 s[0:1]
-; GCN-NEXT:  .Lpost_getpc13:
-; GCN-NEXT:    s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295
-; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc13)>>32
+; GCN-NEXT:  .Lpost_getpc11:
+; GCN-NEXT:    s_add_u32 s0, s0, (.LBB10_7-.Lpost_getpc11)&4294967295
+; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB10_7-.Lpost_getpc11)>>32
 ; GCN-NEXT:    s_setpc_b64 s[0:1]
-; GCN-NEXT:  .LBB10_5: ; %bb14
+; GCN-NEXT:  .LBB10_6: ; %bb14
 ; GCN-NEXT:    s_cmp_lt_i32 s5, 9
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT:    s_cmp_lt_i32 s6, s7
 ; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GCN-NEXT:    s_branch .LBB10_7
-; GCN-NEXT:  .LBB10_6:
-; GCN-NEXT:    ; implicit-def: $vgpr0
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
 ; GCN-NEXT:  .LBB10_7: ; %bb19
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xf
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0xf
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_lshl_b64 s[6:7], s[6:7], 2
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: long_branch_hang:
@@ -1342,35 +1328,45 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX11-NEXT:  ; %bb.8: ; %bb
 ; GFX11-NEXT:    s_getpc_b64 s[8:9]
-; GFX11-NEXT:  .Lpost_getpc11:
+; GFX11-NEXT:  .Lpost_getpc10:
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
-; GFX11-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc11)&4294967295
-; GFX11-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc11)>>32
+; GFX11-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc10)&4294967295
+; GFX11-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc10)>>32
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX11-NEXT:    s_setpc_b64 s[8:9]
 ; GFX11-NEXT:  .LBB10_1: ; %bb13
+; GFX11-NEXT:    s_mov_b64 s[10:11], 0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    v_nop_e64
 ; GFX11-NEXT:    v_nop_e64
 ; GFX11-NEXT:    v_nop_e64
 ; GFX11-NEXT:    v_nop_e64
 ; GFX11-NEXT:    ;;#ASMEND
-; GFX11-NEXT:    s_cbranch_execz .LBB10_3
-; GFX11-NEXT:    s_branch .LBB10_4
+; GFX11-NEXT:    s_branch .LBB10_3
 ; GFX11-NEXT:  .LBB10_2:
+; GFX11-NEXT:    s_mov_b64 s[10:11], -1
 ; GFX11-NEXT:    s_mov_b64 s[8:9], 0
-; GFX11-NEXT:  .LBB10_3: ; %bb9
+; GFX11-NEXT:  .LBB10_3: ; %Flow
+; GFX11-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB10_5
+; GFX11-NEXT:  ; %bb.4: ; %bb9
 ; GFX11-NEXT:    s_cmp_lt_i32 s7, 11
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GFX11-NEXT:    s_cmp_ge_i32 s6, s7
 ; GFX11-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:  .LBB10_4: ; %Flow5
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[8:9]
-; GFX11-NEXT:    s_cbranch_vccnz .LBB10_6
-; GFX11-NEXT:  ; %bb.5: ; %bb14
+; GFX11-NEXT:  .LBB10_5: ; %Flow5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX11-NEXT:    ; implicit-def: $sgpr4
+; GFX11-NEXT:    s_cbranch_scc1 .LBB10_7
+; GFX11-NEXT:  ; %bb.6: ; %bb14
 ; GFX11-NEXT:    s_cmp_lt_i32 s5, 9
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX11-NEXT:    s_cmp_lt_i32 s6, s7
@@ -1379,21 +1375,20 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX11-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; GFX11-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX11-NEXT:    s_branch .LBB10_7
-; GFX11-NEXT:  .LBB10_6:
-; GFX11-NEXT:    ; implicit-def: $vgpr0
+; GFX11-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX11-NEXT:  .LBB10_7: ; %bb19
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x3c
 ; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_u32 s0, s2, s0
 ; GFX11-NEXT:    s_addc_u32 s1, s3, s1
-; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: long_branch_hang:
@@ -1407,56 +1402,17 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    s_cselect_b32 s8, -1, 0
 ; GFX12-NEXT:    s_cmp_lt_i32 s7, 6
-; GFX12-NEXT:    s_cbranch_scc0 .LBB10_1
-; GFX12-NEXT:  ; %bb.18: ; %bb
-; GFX12-NEXT:    s_getpc_b64 s[10:11]
-; GFX12-NEXT:  .Lpost_getpc17:
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s10, s10, (.LBB10_4-.Lpost_getpc17)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_4-.Lpost_getpc17)>>32
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_setpc_b64 s[10:11]
-; GFX12-NEXT:  .LBB10_1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccnz .LBB10_2
-; GFX12-NEXT:  ; %bb.10: ; %Flow
+; GFX12-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX12-NEXT:  ; %bb.7: ; %bb
 ; GFX12-NEXT:    s_getpc_b64 s[8:9]
-; GFX12-NEXT:  .Lpost_getpc13:
+; GFX12-NEXT:  .Lpost_getpc10:
 ; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_5-.Lpost_getpc13)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_5-.Lpost_getpc13)>>32
+; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_2-.Lpost_getpc10)&4294967295
+; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_2-.Lpost_getpc10)>>32
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_setpc_b64 s[8:9]
-; GFX12-NEXT:  .LBB10_2: ; %Flow5
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_vccz .LBB10_3
-; GFX12-NEXT:  ; %bb.12: ; %Flow5
-; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:  .Lpost_getpc14:
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB10_6-.Lpost_getpc14)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB10_6-.Lpost_getpc14)>>32
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_setpc_b64 s[0:1]
-; GFX12-NEXT:  .LBB10_3: ; %bb14
-; GFX12-NEXT:    s_cmp_lt_i32 s5, 9
-; GFX12-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX12-NEXT:    s_cmp_lt_i32 s6, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_or_b32 s1, s4, s1
-; GFX12-NEXT:    s_and_b32 s0, s0, s1
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX12-NEXT:  ; %bb.8: ; %bb14
-; GFX12-NEXT:    s_getpc_b64 s[0:1]
-; GFX12-NEXT:  .Lpost_getpc12:
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB10_7-.Lpost_getpc12)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB10_7-.Lpost_getpc12)>>32
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_setpc_b64 s[0:1]
-; GFX12-NEXT:  .LBB10_4: ; %bb13
+; GFX12-NEXT:  .LBB10_1: ; %bb13
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    s_mov_b32 s1, s8
 ; GFX12-NEXT:    ;;#ASMSTART
 ; GFX12-NEXT:    v_nop_e64
@@ -1464,45 +1420,47 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
 ; GFX12-NEXT:    v_nop_e64
 ; GFX12-NEXT:    v_nop_e64
 ; GFX12-NEXT:    ;;#ASMEND
-; GFX12-NEXT:    s_cbranch_execz .LBB10_5
-; GFX12-NEXT:  ; %bb.14: ; %bb13
-; GFX12-NEXT:    s_getpc_b64 s[8:9]
-; GFX12-NEXT:  .Lpost_getpc15:
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_2-.Lpost_getpc15)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_2-.Lpost_getpc15)>>32
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_setpc_b64 s[8:9]
-; GFX12-NEXT:  .LBB10_5: ; %bb9
+; GFX12-NEXT:  .LBB10_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB10_4
+; GFX12-NEXT:  ; %bb.3: ; %bb9
 ; GFX12-NEXT:    s_cmp_lt_i32 s7, 11
 ; GFX12-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX12-NEXT:    s_cmp_ge_i32 s6, s7
 ; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_and_b32 s1, s4, s1
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB10_6
-; GFX12-NEXT:  ; %bb.16: ; %bb9
-; GFX12-NEXT:    s_getpc_b64 s[8:9]
-; GFX12-NEXT:  .Lpost_getpc16:
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_3-.Lpost_getpc16)&4294967295
-; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_3-.Lpost_getpc16)>>32
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_setpc_b64 s[8:9]
-; GFX12-NEXT:  .LBB10_6:
-; GFX12-NEXT:    ; implicit-def: $vgpr0
-; GFX12-NEXT:  .LBB10_7: ; %bb19
+; GFX12-NEXT:  .LBB10_4: ; %Flow5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s1, 1
+; GFX12-NEXT:    ; implicit-def: $sgpr1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB10_6
+; GFX12-NEXT:  ; %bb.5: ; %bb14
+; GFX12-NEXT:    s_cmp_lt_i32 s5, 9
+; GFX12-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX12-NEXT:    s_cmp_lt_i32 s6, s7
+; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_or_b32 s1, s4, s1
+; GFX12-NEXT:    s_and_b32 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-NEXT:  .LBB10_6: ; %bb19
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x3c
+; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x3c
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX12-NEXT:    s_lshl_b64 s[0:1], s[4:5], 2
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
-; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_endpgm
 bb:
   %tmp = icmp slt i32 %arg2, 9
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 8352376a9c1371..a77ab7d332fbc3 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -515,14 +515,17 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CISI-NEXT:    s_mov_b32 s8, s4
 ; CISI-NEXT:    v_mov_b32_e32 v0, s13
-; CISI-NEXT:    s_mov_b32 s9, s5
 ; CISI-NEXT:    v_add_i32_e32 v0, vcc, s12, v0
+; CISI-NEXT:    s_mov_b32 s9, s5
+; CISI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; CISI-NEXT:    s_cselect_b32 s4, 1, 0
 ; CISI-NEXT:    s_mov_b32 s0, s6
 ; CISI-NEXT:    s_mov_b32 s1, s7
 ; CISI-NEXT:    s_mov_b32 s3, s11
-; CISI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CISI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; CISI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; CISI-NEXT:    s_waitcnt expcnt(0)
+; CISI-NEXT:    v_mov_b32_e32 v0, s4
+; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; CISI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: uaddo32_vcc_user:
@@ -532,13 +535,15 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v4
-; VI-NEXT:    flat_store_byte v[2:3], v5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: uaddo32_vcc_user:
@@ -549,9 +554,11 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: uaddo32_vcc_user:
@@ -559,12 +566,14 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1010-NEXT:    s_clause 0x1
 ; GFX1010-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    v_add_co_u32 v1, s0, s0, s1
-; GFX1010-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX1010-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX1010-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX1010-NEXT:    v_add_co_u32 v0, s0, s0, s1
+; GFX1010-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1010-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX1010-NEXT:    global_store_byte v1, v2, s[6:7]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: uaddo32_vcc_user:
@@ -572,12 +581,14 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1030W32-NEXT:    s_clause 0x1
 ; GFX1030W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
 ; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1030W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1030W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT:    v_add_co_u32 v1, s4, s4, s5
-; GFX1030W32-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX1030W32-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX1030W32-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX1030W32-NEXT:    v_add_co_u32 v0, s4, s4, s5
+; GFX1030W32-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1030W32-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1030W32-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1030W32-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1030W32-NEXT:    global_store_byte v1, v2, s[2:3]
 ; GFX1030W32-NEXT:    s_endpgm
 ;
 ; GFX1030W64-LABEL: uaddo32_vcc_user:
@@ -585,12 +596,14 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1030W64-NEXT:    s_clause 0x1
 ; GFX1030W64-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
 ; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1030W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1030W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT:    v_add_co_u32 v1, s[4:5], s4, s5
-; GFX1030W64-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX1030W64-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX1030W64-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX1030W64-NEXT:    v_add_co_u32 v0, s[4:5], s4, s5
+; GFX1030W64-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX1030W64-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1030W64-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1030W64-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1030W64-NEXT:    global_store_byte v1, v2, s[2:3]
 ; GFX1030W64-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: uaddo32_vcc_user:
@@ -598,14 +611,15 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v1, s4, s4, s5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, s5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b8 v1, v2, s[2:3]
 ; GFX11-NEXT:    s_endpgm
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
@@ -632,16 +646,19 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_addc_u32 s7, s5, s7
 ; CISI-NEXT:    v_mov_b32_e32 v1, s5
 ; CISI-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; CISI-NEXT:    v_mov_b32_e32 v2, s6
+; CISI-NEXT:    v_mov_b32_e32 v0, s6
+; CISI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1
+; CISI-NEXT:    v_mov_b32_e32 v1, s7
+; CISI-NEXT:    s_cselect_b32 s4, 1, 0
 ; CISI-NEXT:    s_mov_b32 s0, s2
 ; CISI-NEXT:    s_mov_b32 s1, s3
 ; CISI-NEXT:    s_mov_b32 s2, s10
 ; CISI-NEXT:    s_mov_b32 s3, s11
-; CISI-NEXT:    v_mov_b32_e32 v3, s7
-; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CISI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; CISI-NEXT:    s_waitcnt expcnt(0)
+; CISI-NEXT:    v_mov_b32_e32 v0, s4
 ; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; CISI-NEXT:    s_endpgm
 ;
@@ -655,31 +672,35 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_addc_u32 s1, s5, s7
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_mov_b32_e32 v7, s1
 ; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: suaddo64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_add_u32 s0, s8, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    s_addc_u32 s1, s9, s11
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_byte v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: suaddo64:
@@ -690,9 +711,11 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1010-NEXT:    s_add_u32 s0, s8, s10
 ; GFX1010-NEXT:    s_addc_u32 s1, s9, s11
 ; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-NEXT:    v_cmp_lt_u64_e64 s2, s[0:1], s[8:9]
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1010-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
-; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-NEXT:    global_store_byte v2, v3, s[6:7]
 ; GFX1010-NEXT:    s_endpgm
@@ -707,7 +730,9 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1030W32-NEXT:    v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1030W32-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1030W32-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1030W32-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX1030W32-NEXT:    s_endpgm
@@ -722,7 +747,9 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1030W64-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[6:7], s[4:5]
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GFX1030W64-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX1030W64-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1030W64-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX1030W64-NEXT:    s_endpgm
@@ -733,11 +760,13 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_u32 s6, s4, s6
 ; GFX11-NEXT:    s_addc_u32 s7, s5, s7
-; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
@@ -1391,14 +1420,17 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CISI-NEXT:    s_mov_b32 s8, s4
 ; CISI-NEXT:    v_mov_b32_e32 v0, s13
-; CISI-NEXT:    s_mov_b32 s9, s5
 ; CISI-NEXT:    v_sub_i32_e32 v0, vcc, s12, v0
+; CISI-NEXT:    s_mov_b32 s9, s5
+; CISI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; CISI-NEXT:    s_cselect_b32 s4, 1, 0
 ; CISI-NEXT:    s_mov_b32 s0, s6
 ; CISI-NEXT:    s_mov_b32 s1, s7
 ; CISI-NEXT:    s_mov_b32 s3, s11
-; CISI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CISI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; CISI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; CISI-NEXT:    s_waitcnt expcnt(0)
+; CISI-NEXT:    v_mov_b32_e32 v0, s4
+; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; CISI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: usubo32_vcc_user:
@@ -1408,13 +1440,15 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_sub_u32_e32 v4, vcc, s0, v4
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v4
-; VI-NEXT:    flat_store_byte v[2:3], v5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: usubo32_vcc_user:
@@ -1425,9 +1459,11 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: usubo32_vcc_user:
@@ -1435,12 +1471,14 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1010-NEXT:    s_clause 0x1
 ; GFX1010-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX1010-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1010-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT:    v_sub_co_u32 v1, s0, s0, s1
-; GFX1010-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX1010-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX1010-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX1010-NEXT:    v_sub_co_u32 v0, s0, s0, s1
+; GFX1010-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1010-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX1010-NEXT:    global_store_byte v1, v2, s[6:7]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1030W32-LABEL: usubo32_vcc_user:
@@ -1448,12 +1486,14 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1030W32-NEXT:    s_clause 0x1
 ; GFX1030W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
 ; GFX1030W32-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1030W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1030W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT:    v_sub_co_u32 v1, s4, s4, s5
-; GFX1030W32-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX1030W32-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX1030W32-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX1030W32-NEXT:    v_sub_co_u32 v0, s4, s4, s5
+; GFX1030W32-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1030W32-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1030W32-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1030W32-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1030W32-NEXT:    global_store_byte v1, v2, s[2:3]
 ; GFX1030W32-NEXT:    s_endpgm
 ;
 ; GFX1030W64-LABEL: usubo32_vcc_user:
@@ -1461,12 +1501,14 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX1030W64-NEXT:    s_clause 0x1
 ; GFX1030W64-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x34
 ; GFX1030W64-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX1030W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1030W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1030W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT:    v_sub_co_u32 v1, s[4:5], s4, s5
-; GFX1030W64-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX1030W64-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX1030W64-NEXT:    global_store_byte v0, v2, s[2:3]
+; GFX1030W64-NEXT:    v_sub_co_u32 v0, s[4:5], s4, s5
+; GFX1030W64-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX1030W64-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1030W64-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1030W64-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX1030W64-NEXT:    global_store_byte v1, v2, s[2:3]
 ; GFX1030W64-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: usubo32_vcc_user:
@@ -1474,14 +1516,15 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_sub_co_u32 v1, s4, s4, s5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT:    v_sub_co_u32 v0, s4, s4, s5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    global_store_b8 v1, v2, s[2:3]
 ; GFX11-NEXT:    s_endpgm
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
@@ -1508,16 +1551,19 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_subb_u32 s7, s5, s7
 ; CISI-NEXT:    v_mov_b32_e32 v1, s5
 ; CISI-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; CISI-NEXT:    v_mov_b32_e32 v2, s6
+; CISI-NEXT:    v_mov_b32_e32 v0, s6
+; CISI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1
+; CISI-NEXT:    v_mov_b32_e32 v1, s7
+; CISI-NEXT:    s_cselect_b32 s4, 1, 0
 ; CISI-NEXT:    s_mov_b32 s0, s2
 ; CISI-NEXT:    s_mov_b32 s1, s3
 ; CISI-NEXT:    s_mov_b32 s2, s10
 ; CISI-NEXT:    s_mov_b32 s3, s11
-; CISI-NEXT:    v_mov_b32_e32 v3, s7
-; CISI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CISI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; CISI-NEXT:    s_waitcnt expcnt(0)
+; CISI-NEXT:    v_mov_b32_e32 v0, s4
 ; CISI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; CISI-NEXT:    s_endpgm
 ;
@@ -1531,31 +1577,35 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_subb_u32 s1, s5, s7
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_mov_b32_e32 v7, s1
 ; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: susubo64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_sub_u32 s0, s8, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    s_subb_u32 s1, s9, s11
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_byte v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: susubo64:
@@ -1566,9 +1616,11 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1010-NEXT:    s_sub_u32 s0, s8, s10
 ; GFX1010-NEXT:    s_subb_u32 s1, s9, s11
 ; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-NEXT:    v_cmp_gt_u64_e64 s2, s[0:1], s[8:9]
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1010-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
-; GFX1010-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-NEXT:    global_store_byte v2, v3, s[6:7]
 ; GFX1010-NEXT:    s_endpgm
@@ -1583,7 +1635,9 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1030W32-NEXT:    v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1030W32-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX1030W32-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1030W32-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1030W32-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1030W32-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX1030W32-NEXT:    s_endpgm
@@ -1598,7 +1652,9 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1030W64-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[6:7], s[4:5]
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1030W64-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GFX1030W64-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX1030W64-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1030W64-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX1030W64-NEXT:    global_store_byte v2, v3, s[2:3]
 ; GFX1030W64-NEXT:    s_endpgm
@@ -1609,11 +1665,13 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sub_u32 s6, s4, s6
 ; GFX11-NEXT:    s_subb_u32 s7, s5, s7
-; GFX11-NEXT:    v_mov_b32_e32 v0, s6
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
@@ -1779,12 +1837,13 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; CISI-NEXT:    s_mov_b32 s0, 0
 ; CISI-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[0:1], 0
 ; CISI-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; CISI-NEXT:    s_cbranch_vccz .LBB16_4
+; CISI-NEXT:    s_cbranch_vccz .LBB16_2
 ; CISI-NEXT:  ; %bb.1:
 ; CISI-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; CISI-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; CISI-NEXT:    s_sub_u32 s0, 0, s2
 ; CISI-NEXT:    s_subb_u32 s1, 0, s3
+; CISI-NEXT:    s_mov_b64 s[8:9], 0
 ; CISI-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; CISI-NEXT:    v_rcp_f32_e32 v0, v0
 ; CISI-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1887,8 +1946,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; CISI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; CISI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; CISI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CISI-NEXT:    s_cbranch_execnz .LBB16_3
+; CISI-NEXT:    s_branch .LBB16_3
 ; CISI-NEXT:  .LBB16_2:
+; CISI-NEXT:    s_mov_b64 s[8:9], -1
+; CISI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CISI-NEXT:  .LBB16_3: ; %Flow
+; CISI-NEXT:    s_and_b64 s[0:1], s[8:9], exec
+; CISI-NEXT:    s_cselect_b32 s0, 1, 0
+; CISI-NEXT:    s_cmp_lg_u32 s0, 1
+; CISI-NEXT:    s_cbranch_scc1 .LBB16_5
+; CISI-NEXT:  ; %bb.4:
 ; CISI-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; CISI-NEXT:    s_sub_i32 s0, 0, s2
 ; CISI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -1912,14 +1979,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; CISI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; CISI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CISI-NEXT:    v_mov_b32_e32 v1, 0
-; CISI-NEXT:  .LBB16_3:
+; CISI-NEXT:  .LBB16_5:
 ; CISI-NEXT:    s_mov_b32 s7, 0xf000
 ; CISI-NEXT:    s_mov_b32 s6, -1
 ; CISI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CISI-NEXT:    s_endpgm
-; CISI-NEXT:  .LBB16_4:
-; CISI-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; CISI-NEXT:    s_branch .LBB16_2
 ;
 ; VI-LABEL: sudiv64:
 ; VI:       ; %bb.0:
@@ -1929,7 +1993,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; VI-NEXT:    s_or_b64 s[0:1], s[6:7], s[2:3]
 ; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT:    s_cbranch_scc0 .LBB16_4
+; VI-NEXT:    s_cbranch_scc0 .LBB16_2
 ; VI-NEXT:  ; %bb.1:
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; VI-NEXT:    v_cvt_f32_u32_e32 v1, s3
@@ -2042,10 +2106,19 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, s10
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT:    s_mov_b64 s[8:9], 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT:    s_cbranch_execnz .LBB16_3
+; VI-NEXT:    s_branch .LBB16_3
 ; VI-NEXT:  .LBB16_2:
+; VI-NEXT:    s_mov_b64 s[8:9], -1
+; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT:  .LBB16_3: ; %Flow
+; VI-NEXT:    s_and_b64 s[0:1], s[8:9], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s0, 1
+; VI-NEXT:    s_cbranch_scc1 .LBB16_5
+; VI-NEXT:  ; %bb.4:
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; VI-NEXT:    s_sub_i32 s0, 0, s2
 ; VI-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -2069,14 +2142,11 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:  .LBB16_3:
+; VI-NEXT:  .LBB16_5:
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
-; VI-NEXT:  .LBB16_4:
-; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; VI-NEXT:    s_branch .LBB16_2
 ;
 ; GFX9-LABEL: sudiv64:
 ; GFX9:       ; %bb.0:
@@ -2086,12 +2156,13 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[6:7], s[8:9]
 ; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_cbranch_scc0 .LBB16_4
+; GFX9-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s8
 ; GFX9-NEXT:    s_subb_u32 s1, 0, s9
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2212,8 +2283,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    s_cbranch_execnz .LBB16_3
+; GFX9-NEXT:    s_branch .LBB16_3
 ; GFX9-NEXT:  .LBB16_2:
+; GFX9-NEXT:    s_mov_b64 s[2:3], -1
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:  .LBB16_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB16_5
+; GFX9-NEXT:  ; %bb.4:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    s_sub_i32 s0, 0, s8
 ; GFX9-NEXT:    s_mov_b32 s1, 0
@@ -2237,13 +2316,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:  .LBB16_3:
+; GFX9-NEXT:  .LBB16_5:
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
-; GFX9-NEXT:  .LBB16_4:
-; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT:    s_branch .LBB16_2
 ;
 ; GFX1010-LABEL: sudiv64:
 ; GFX1010:       ; %bb.0:
@@ -2254,7 +2330,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    s_or_b64 s[2:3], s[6:7], s[8:9]
 ; GFX1010-NEXT:    s_mov_b32 s2, 0
 ; GFX1010-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1010-NEXT:    s_cbranch_scc0 .LBB16_4
+; GFX1010-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GFX1010-NEXT:  ; %bb.1:
 ; GFX1010-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX1010-NEXT:    v_cvt_f32_u32_e32 v1, s9
@@ -2373,9 +2449,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX1010-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX1010-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
-; GFX1010-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s2
-; GFX1010-NEXT:    s_cbranch_vccnz .LBB16_3
+; GFX1010-NEXT:    s_branch .LBB16_3
 ; GFX1010-NEXT:  .LBB16_2:
+; GFX1010-NEXT:    s_mov_b32 s2, -1
+; GFX1010-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1010-NEXT:  .LBB16_3: ; %Flow
+; GFX1010-NEXT:    s_and_b32 s0, s2, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX1010-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX1010-NEXT:    s_cbranch_scc1 .LBB16_5
+; GFX1010-NEXT:  ; %bb.4:
 ; GFX1010-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX1010-NEXT:    s_sub_i32 s1, 0, s8
 ; GFX1010-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -2399,13 +2482,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1010-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX1010-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1010-NEXT:  .LBB16_3:
+; GFX1010-NEXT:  .LBB16_5:
 ; GFX1010-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1010-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1010-NEXT:    s_endpgm
-; GFX1010-NEXT:  .LBB16_4:
-; GFX1010-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1010-NEXT:    s_branch .LBB16_2
 ;
 ; GFX1030W32-LABEL: sudiv64:
 ; GFX1030W32:       ; %bb.0:
@@ -2416,7 +2496,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    s_or_b64 s[8:9], s[6:7], s[2:3]
 ; GFX1030W32-NEXT:    s_mov_b32 s8, 0
 ; GFX1030W32-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX1030W32-NEXT:    s_cbranch_scc0 .LBB16_4
+; GFX1030W32-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GFX1030W32-NEXT:  ; %bb.1:
 ; GFX1030W32-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX1030W32-NEXT:    v_cvt_f32_u32_e32 v1, s3
@@ -2535,9 +2615,16 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX1030W32-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc_lo
 ; GFX1030W32-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
-; GFX1030W32-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
-; GFX1030W32-NEXT:    s_cbranch_vccnz .LBB16_3
+; GFX1030W32-NEXT:    s_branch .LBB16_3
 ; GFX1030W32-NEXT:  .LBB16_2:
+; GFX1030W32-NEXT:    s_mov_b32 s8, -1
+; GFX1030W32-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1030W32-NEXT:  .LBB16_3: ; %Flow
+; GFX1030W32-NEXT:    s_and_b32 s0, s8, exec_lo
+; GFX1030W32-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX1030W32-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX1030W32-NEXT:    s_cbranch_scc1 .LBB16_5
+; GFX1030W32-NEXT:  ; %bb.4:
 ; GFX1030W32-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX1030W32-NEXT:    s_sub_i32 s1, 0, s2
 ; GFX1030W32-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -2561,13 +2648,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W32-NEXT:    s_cselect_b32 s0, s3, s0
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1030W32-NEXT:  .LBB16_3:
+; GFX1030W32-NEXT:  .LBB16_5:
 ; GFX1030W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1030W32-NEXT:    s_endpgm
-; GFX1030W32-NEXT:  .LBB16_4:
-; GFX1030W32-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1030W32-NEXT:    s_branch .LBB16_2
 ;
 ; GFX1030W64-LABEL: sudiv64:
 ; GFX1030W64:       ; %bb.0:
@@ -2578,7 +2662,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W64-NEXT:    s_or_b64 s[0:1], s[6:7], s[2:3]
 ; GFX1030W64-NEXT:    s_mov_b32 s0, 0
 ; GFX1030W64-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1030W64-NEXT:    s_cbranch_scc0 .LBB16_4
+; GFX1030W64-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GFX1030W64-NEXT:  ; %bb.1:
 ; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v1, s3
@@ -2692,13 +2776,22 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W64-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s14
 ; GFX1030W64-NEXT:    v_cndmask_b32_e64 v0, s7, v0, s[0:1]
+; GFX1030W64-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1030W64-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc
 ; GFX1030W64-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc
 ; GFX1030W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX1030W64-NEXT:    v_cndmask_b32_e32 v1, s11, v1, vcc
 ; GFX1030W64-NEXT:    v_cndmask_b32_e32 v0, s10, v2, vcc
-; GFX1030W64-NEXT:    s_cbranch_execnz .LBB16_3
+; GFX1030W64-NEXT:    s_branch .LBB16_3
 ; GFX1030W64-NEXT:  .LBB16_2:
+; GFX1030W64-NEXT:    s_mov_b64 s[0:1], -1
+; GFX1030W64-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1030W64-NEXT:  .LBB16_3: ; %Flow
+; GFX1030W64-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX1030W64-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX1030W64-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX1030W64-NEXT:    s_cbranch_scc1 .LBB16_5
+; GFX1030W64-NEXT:  ; %bb.4:
 ; GFX1030W64-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX1030W64-NEXT:    s_sub_i32 s1, 0, s2
 ; GFX1030W64-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -2722,13 +2815,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX1030W64-NEXT:    s_cselect_b32 s0, s3, s0
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1030W64-NEXT:  .LBB16_3:
+; GFX1030W64-NEXT:  .LBB16_5:
 ; GFX1030W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1030W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX1030W64-NEXT:    s_endpgm
-; GFX1030W64-NEXT:  .LBB16_4:
-; GFX1030W64-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1030W64-NEXT:    s_branch .LBB16_2
 ;
 ; GFX11-LABEL: sudiv64:
 ; GFX11:       ; %bb.0:
@@ -2740,7 +2830,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_mov_b32 s8, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB16_4
+; GFX11-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, s3
@@ -2873,9 +2963,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, s9, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s8
-; GFX11-NEXT:    s_cbranch_vccnz .LBB16_3
+; GFX11-NEXT:    s_branch .LBB16_3
 ; GFX11-NEXT:  .LBB16_2:
+; GFX11-NEXT:    s_mov_b32 s8, -1
+; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT:  .LBB16_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB16_5
+; GFX11-NEXT:  ; %bb.4:
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX11-NEXT:    s_sub_i32 s1, 0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -2905,13 +3003,10 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_cselect_b32 s0, s3, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:  .LBB16_3:
+; GFX11-NEXT:  .LBB16_5:
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_endpgm
-; GFX11-NEXT:  .LBB16_4:
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT:    s_branch .LBB16_2
   %result = udiv i64 %x, %y
   store i64 %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
index a25078230385ec..ed30786cbaabe9 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -8,55 +8,65 @@ define i32 @combine_add_zext_xor() {
 ; GFX1010-LABEL: combine_add_zext_xor:
 ; GFX1010:       ; %bb.0: ; %.entry
 ; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-NEXT:    s_mov_b32 s4, 0
 ; GFX1010-NEXT:    s_branch .LBB0_2
 ; GFX1010-NEXT:  .LBB0_1: ; %bb9
 ; GFX1010-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX1010-NEXT:    s_xor_b32 s4, s4, -1
-; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX1010-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX1010-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1010-NEXT:    s_cbranch_vccz .LBB0_4
+; GFX1010-NEXT:    s_xor_b32 s5, s5, -1
+; GFX1010-NEXT:    s_and_b32 s5, s5, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX1010-NEXT:    s_add_i32 s6, s4, s5
+; GFX1010-NEXT:    s_cmpk_gt_i32 s4, 0xfbe6
+; GFX1010-NEXT:    s_mov_b32 s4, s6
+; GFX1010-NEXT:    s_cbranch_scc0 .LBB0_4
 ; GFX1010-NEXT:  .LBB0_2: ; %.a
 ; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1010-NEXT:    ; implicit-def: $sgpr4
+; GFX1010-NEXT:    ; implicit-def: $sgpr5
 ; GFX1010-NEXT:    s_cbranch_scc1 .LBB0_1
 ; GFX1010-NEXT:  ; %bb.3: ; %bb
 ; GFX1010-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-NEXT:    buffer_load_dword v0, v0, s[4:7], 64 offen glc
 ; GFX1010-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
+; GFX1010-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1010-NEXT:    s_cmp_eq_u32 s5, 0
+; GFX1010-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX1010-NEXT:    s_branch .LBB0_1
 ; GFX1010-NEXT:  .LBB0_4: ; %.exit
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_add_zext_xor:
 ; GFX1100:       ; %bb.0: ; %.entry
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-NEXT:    s_mov_b32 s0, 0
 ; GFX1100-NEXT:    s_branch .LBB0_2
 ; GFX1100-NEXT:  .LBB0_1: ; %bb9
 ; GFX1100-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    s_xor_b32 s0, s0, -1
-; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX1100-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1100-NEXT:    s_cbranch_vccz .LBB0_4
+; GFX1100-NEXT:    s_xor_b32 s1, s1, -1
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX1100-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX1100-NEXT:    s_add_i32 s2, s0, s1
+; GFX1100-NEXT:    s_cmpk_gt_i32 s0, 0xfbe6
+; GFX1100-NEXT:    s_mov_b32 s0, s2
+; GFX1100-NEXT:    s_cbranch_scc0 .LBB0_4
 ; GFX1100-NEXT:  .LBB0_2: ; %.a
 ; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1100-NEXT:    ; implicit-def: $sgpr0
+; GFX1100-NEXT:    ; implicit-def: $sgpr1
 ; GFX1100-NEXT:    s_cbranch_scc1 .LBB0_1
 ; GFX1100-NEXT:  ; %bb.3: ; %bb
 ; GFX1100-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-NEXT:    buffer_load_b32 v0, v0, s[0:3], 64 offen glc
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX1100-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX1100-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX1100-NEXT:    s_branch .LBB0_1
 ; GFX1100-NEXT:  .LBB0_4: ; %.exit
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -88,55 +98,65 @@ define i32 @combine_sub_zext_xor() {
 ; GFX1010-LABEL: combine_sub_zext_xor:
 ; GFX1010:       ; %bb.0: ; %.entry
 ; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-NEXT:    s_mov_b32 s4, 0
 ; GFX1010-NEXT:    s_branch .LBB1_2
 ; GFX1010-NEXT:  .LBB1_1: ; %bb9
 ; GFX1010-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GFX1010-NEXT:    s_xor_b32 s4, s4, -1
-; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX1010-NEXT:    v_sub_nc_u32_e32 v2, v1, v0
-; GFX1010-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1010-NEXT:    s_cbranch_vccz .LBB1_4
+; GFX1010-NEXT:    s_xor_b32 s5, s5, -1
+; GFX1010-NEXT:    s_and_b32 s5, s5, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX1010-NEXT:    s_sub_i32 s6, s4, s5
+; GFX1010-NEXT:    s_cmpk_gt_i32 s4, 0xfbe6
+; GFX1010-NEXT:    s_mov_b32 s4, s6
+; GFX1010-NEXT:    s_cbranch_scc0 .LBB1_4
 ; GFX1010-NEXT:  .LBB1_2: ; %.a
 ; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1010-NEXT:    ; implicit-def: $sgpr4
+; GFX1010-NEXT:    ; implicit-def: $sgpr5
 ; GFX1010-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1010-NEXT:  ; %bb.3: ; %bb
 ; GFX1010-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-NEXT:    buffer_load_dword v0, v0, s[4:7], 64 offen glc
 ; GFX1010-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
+; GFX1010-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1010-NEXT:    s_cmp_eq_u32 s5, 0
+; GFX1010-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX1010-NEXT:    s_branch .LBB1_1
 ; GFX1010-NEXT:  .LBB1_4: ; %.exit
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_sub_zext_xor:
 ; GFX1100:       ; %bb.0: ; %.entry
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-NEXT:    s_mov_b32 s0, 0
 ; GFX1100-NEXT:    s_branch .LBB1_2
 ; GFX1100-NEXT:  .LBB1_1: ; %bb9
 ; GFX1100-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    s_xor_b32 s0, s0, -1
-; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_sub_nc_u32_e32 v2, v1, v0
-; GFX1100-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1100-NEXT:    s_cbranch_vccz .LBB1_4
+; GFX1100-NEXT:    s_xor_b32 s1, s1, -1
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX1100-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX1100-NEXT:    s_sub_i32 s2, s0, s1
+; GFX1100-NEXT:    s_cmpk_gt_i32 s0, 0xfbe6
+; GFX1100-NEXT:    s_mov_b32 s0, s2
+; GFX1100-NEXT:    s_cbranch_scc0 .LBB1_4
 ; GFX1100-NEXT:  .LBB1_2: ; %.a
 ; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1100-NEXT:    ; implicit-def: $sgpr0
+; GFX1100-NEXT:    ; implicit-def: $sgpr1
 ; GFX1100-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX1100-NEXT:  ; %bb.3: ; %bb
 ; GFX1100-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-NEXT:    buffer_load_b32 v0, v0, s[0:3], 64 offen glc
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX1100-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX1100-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX1100-NEXT:    s_branch .LBB1_1
 ; GFX1100-NEXT:  .LBB1_4: ; %.exit
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -190,7 +210,9 @@ define i32 @combine_add_zext_or() {
 ; GFX1010-NEXT:    s_branch .LBB2_1
 ; GFX1010-NEXT:  .LBB2_4: ; %.exit
 ; GFX1010-NEXT:    s_or_b32 s4, s5, s6
-; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1010-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_add_zext_or:
@@ -218,8 +240,10 @@ define i32 @combine_add_zext_or() {
 ; GFX1100-NEXT:    s_branch .LBB2_1
 ; GFX1100-NEXT:  .LBB2_4: ; %.exit
 ; GFX1100-NEXT:    s_or_b32 s0, s1, s2
-; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX1100-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -274,7 +298,9 @@ define i32 @combine_sub_zext_or() {
 ; GFX1010-NEXT:    s_branch .LBB3_1
 ; GFX1010-NEXT:  .LBB3_4: ; %.exit
 ; GFX1010-NEXT:    s_or_b32 s4, s5, s6
-; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1010-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_sub_zext_or:
@@ -302,8 +328,10 @@ define i32 @combine_sub_zext_or() {
 ; GFX1100-NEXT:    s_branch .LBB3_1
 ; GFX1100-NEXT:  .LBB3_4: ; %.exit
 ; GFX1100-NEXT:    s_or_b32 s0, s1, s2
-; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX1100-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -336,53 +364,67 @@ define i32 @combine_add_zext_and() {
 ; GFX1010-LABEL: combine_add_zext_and:
 ; GFX1010:       ; %bb.0: ; %.entry
 ; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-NEXT:    s_mov_b32 s4, 0
 ; GFX1010-NEXT:    s_branch .LBB4_2
 ; GFX1010-NEXT:  .LBB4_1: ; %bb9
 ; GFX1010-NEXT:    ; in Loop: Header=BB4_2 Depth=1
-; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1010-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX1010-NEXT:    v_add_nc_u32_e32 v1, v1, v0
+; GFX1010-NEXT:    s_cmpk_gt_i32 s4, 0xfbe6
+; GFX1010-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX1010-NEXT:    s_and_b32 s5, s5, s6
+; GFX1010-NEXT:    s_and_b32 s5, s5, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX1010-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; GFX1010-NEXT:    s_add_i32 s4, s4, s5
 ; GFX1010-NEXT:    s_cbranch_vccz .LBB4_4
 ; GFX1010-NEXT:  .LBB4_2: ; %.a
 ; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1010-NEXT:    ; implicit-def: $sgpr4
+; GFX1010-NEXT:    ; implicit-def: $sgpr5
 ; GFX1010-NEXT:    s_cbranch_scc1 .LBB4_1
 ; GFX1010-NEXT:  ; %bb.3: ; %bb
 ; GFX1010-NEXT:    ; in Loop: Header=BB4_2 Depth=1
-; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-NEXT:    buffer_load_dword v0, v0, s[4:7], 64 offen glc
 ; GFX1010-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
+; GFX1010-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1010-NEXT:    s_cmp_eq_u32 s5, 0
+; GFX1010-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX1010-NEXT:    s_branch .LBB4_1
 ; GFX1010-NEXT:  .LBB4_4: ; %.exit
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_add_zext_and:
 ; GFX1100:       ; %bb.0: ; %.entry
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-NEXT:    s_mov_b32 s0, 0
 ; GFX1100-NEXT:    s_branch .LBB4_2
 ; GFX1100-NEXT:  .LBB4_1: ; %bb9
 ; GFX1100-NEXT:    ; in Loop: Header=BB4_2 Depth=1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1100-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT:    v_add_nc_u32_e32 v1, v1, v0
+; GFX1100-NEXT:    s_cmpk_gt_i32 s0, 0xfbe6
+; GFX1100-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_and_b32 s1, s1, s2
+; GFX1100-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX1100-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX1100-NEXT:    s_add_i32 s0, s0, s1
 ; GFX1100-NEXT:    s_cbranch_vccz .LBB4_4
 ; GFX1100-NEXT:  .LBB4_2: ; %.a
 ; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1100-NEXT:    ; implicit-def: $sgpr0
+; GFX1100-NEXT:    ; implicit-def: $sgpr1
 ; GFX1100-NEXT:    s_cbranch_scc1 .LBB4_1
 ; GFX1100-NEXT:  ; %bb.3: ; %bb
 ; GFX1100-NEXT:    ; in Loop: Header=BB4_2 Depth=1
-; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-NEXT:    buffer_load_b32 v0, v0, s[0:3], 64 offen glc
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX1100-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX1100-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX1100-NEXT:    s_branch .LBB4_1
 ; GFX1100-NEXT:  .LBB4_4: ; %.exit
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -415,53 +457,67 @@ define i32 @combine_sub_zext_and() {
 ; GFX1010-LABEL: combine_sub_zext_and:
 ; GFX1010:       ; %bb.0: ; %.entry
 ; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1010-NEXT:    s_mov_b32 s4, 0
 ; GFX1010-NEXT:    s_branch .LBB5_2
 ; GFX1010-NEXT:  .LBB5_1: ; %bb9
 ; GFX1010-NEXT:    ; in Loop: Header=BB5_2 Depth=1
-; GFX1010-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1010-NEXT:    s_and_b32 s4, s4, vcc_lo
-; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX1010-NEXT:    v_sub_nc_u32_e32 v1, v1, v0
+; GFX1010-NEXT:    s_cmpk_gt_i32 s4, 0xfbe6
+; GFX1010-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX1010-NEXT:    s_and_b32 s5, s5, s6
+; GFX1010-NEXT:    s_and_b32 s5, s5, exec_lo
+; GFX1010-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX1010-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; GFX1010-NEXT:    s_sub_i32 s4, s4, s5
 ; GFX1010-NEXT:    s_cbranch_vccz .LBB5_4
 ; GFX1010-NEXT:  .LBB5_2: ; %.a
 ; GFX1010-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1010-NEXT:    ; implicit-def: $sgpr4
+; GFX1010-NEXT:    ; implicit-def: $sgpr5
 ; GFX1010-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1010-NEXT:  ; %bb.3: ; %bb
 ; GFX1010-NEXT:    ; in Loop: Header=BB5_2 Depth=1
-; GFX1010-NEXT:    buffer_load_dword v0, v1, s[4:7], 64 offen glc
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1010-NEXT:    buffer_load_dword v0, v0, s[4:7], 64 offen glc
 ; GFX1010-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
+; GFX1010-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1010-NEXT:    s_cmp_eq_u32 s5, 0
+; GFX1010-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX1010-NEXT:    s_branch .LBB5_1
 ; GFX1010-NEXT:  .LBB5_4: ; %.exit
+; GFX1010-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_sub_zext_and:
 ; GFX1100:       ; %bb.0: ; %.entry
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1100-NEXT:    s_mov_b32 s0, 0
 ; GFX1100-NEXT:    s_branch .LBB5_2
 ; GFX1100-NEXT:  .LBB5_1: ; %bb9
 ; GFX1100-NEXT:    ; in Loop: Header=BB5_2 Depth=1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1
-; GFX1100-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX1100-NEXT:    v_sub_nc_u32_e32 v1, v1, v0
+; GFX1100-NEXT:    s_cmpk_gt_i32 s0, 0xfbe6
+; GFX1100-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_and_b32 s1, s1, s2
+; GFX1100-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX1100-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX1100-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX1100-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX1100-NEXT:    s_cbranch_vccz .LBB5_4
 ; GFX1100-NEXT:  .LBB5_2: ; %.a
 ; GFX1100-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1100-NEXT:    ; implicit-def: $sgpr0
+; GFX1100-NEXT:    ; implicit-def: $sgpr1
 ; GFX1100-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX1100-NEXT:  ; %bb.3: ; %bb
 ; GFX1100-NEXT:    ; in Loop: Header=BB5_2 Depth=1
-; GFX1100-NEXT:    buffer_load_b32 v0, v1, s[0:3], 64 offen glc
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1100-NEXT:    buffer_load_b32 v0, v0, s[0:3], 64 offen glc
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0)
-; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX1100-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX1100-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX1100-NEXT:    s_branch .LBB5_1
 ; GFX1100-NEXT:  .LBB5_4: ; %.exit
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 10d71a315fbf98..f42e5dc4906337 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -475,9 +475,8 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-NEXT:    s_min_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_lt_i32 s0, 0x3e9
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_cselect_b32 s0, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -495,9 +494,8 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-NEXT:    s_max_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_gt_i32 s0, 0x3e8
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_cselect_b32 s0, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -515,9 +513,8 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-NEXT:    s_min_u32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmp_lt_u32 s0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_cselect_b32 s0, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -535,9 +532,8 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-NEXT:    s_max_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmp_ge_i32 s0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_cselect_b32 s0, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -555,9 +551,8 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-NEXT:    s_max_u32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_lt_u32 s0, 0x3e9
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_cselect_b32 s0, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -575,9 +570,8 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-NEXT:    s_min_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_gt_i32 s0, 0x3e7
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_cselect_b32 s0, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -595,9 +589,8 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-NEXT:    s_max_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmp_le_i32 s0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_cselect_b32 s0, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -615,9 +608,8 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-NEXT:    s_min_u32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_cselect_b32 s0, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 40929d58834472..ffdba791e281d7 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1492,28 +1492,34 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s5, s4, 16
 ; SI-NEXT:    s_cmp_lg_u32 s5, 0
-; SI-NEXT:    s_cbranch_scc0 .LBB14_4
+; SI-NEXT:    s_cbranch_scc0 .LBB14_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
-; SI-NEXT:    s_cbranch_execnz .LBB14_3
-; SI-NEXT:  .LBB14_2: ; %if
+; SI-NEXT:    s_mov_b64 s[2:3], 0
+; SI-NEXT:    s_branch .LBB14_3
+; SI-NEXT:  .LBB14_2:
+; SI-NEXT:    s_mov_b64 s[2:3], -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:  .LBB14_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; SI-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s2, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB14_5
+; SI-NEXT:  ; %bb.4: ; %if
 ; SI-NEXT:    s_and_b32 s2, s4, 0xffff
 ; SI-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:  .LBB14_3: ; %endif
+; SI-NEXT:  .LBB14_5: ; %endif
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-; SI-NEXT:  .LBB14_4:
-; SI-NEXT:    v_mov_b32_e32 v0, 0
-; SI-NEXT:    s_branch .LBB14_2
 ;
 ; VI-LABEL: ctpop_i16_in_br:
 ; VI:       ; %bb.0: ; %entry
@@ -1522,28 +1528,34 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s5, s4, 16
 ; VI-NEXT:    s_cmp_lg_u32 s5, 0
-; VI-NEXT:    s_cbranch_scc0 .LBB14_4
+; VI-NEXT:    s_cbranch_scc0 .LBB14_2
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    s_mov_b32 s11, 0xf000
 ; VI-NEXT:    s_mov_b32 s10, -1
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
-; VI-NEXT:    s_cbranch_execnz .LBB14_3
-; VI-NEXT:  .LBB14_2: ; %if
+; VI-NEXT:    s_mov_b64 s[2:3], 0
+; VI-NEXT:    s_branch .LBB14_3
+; VI-NEXT:  .LBB14_2:
+; VI-NEXT:    s_mov_b64 s[2:3], -1
+; VI-NEXT:    ; implicit-def: $vgpr0
+; VI-NEXT:  .LBB14_3: ; %Flow
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s2, 1
+; VI-NEXT:    s_cbranch_scc1 .LBB14_5
+; VI-NEXT:  ; %bb.4: ; %if
 ; VI-NEXT:    s_and_b32 s2, s4, 0xffff
 ; VI-NEXT:    s_bcnt1_i32_b32 s2, s2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:  .LBB14_3: ; %endif
+; VI-NEXT:  .LBB14_5: ; %endif
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-; VI-NEXT:  .LBB14_4:
-; VI-NEXT:    ; implicit-def: $vgpr0
-; VI-NEXT:    s_branch .LBB14_2
 ;
 ; EG-LABEL: ctpop_i16_in_br:
 ; EG:       ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 1c16612bed37fc..38fa5fd5a2e7cb 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -339,27 +339,31 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0xd
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s0, 0
-; SI-NEXT:    s_cbranch_scc0 .LBB7_4
+; SI-NEXT:    s_cbranch_scc0 .LBB7_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; SI-NEXT:    s_mov_b64 s[6:7], 0
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; SI-NEXT:    s_branch .LBB7_3
+; SI-NEXT:  .LBB7_2:
+; SI-NEXT:    s_mov_b64 s[6:7], -1
+; SI-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; SI-NEXT:  .LBB7_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; SI-NEXT:    s_cselect_b32 s6, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s6, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB7_5
+; SI-NEXT:  ; %bb.4: ; %if
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 vcc, vcc
-; SI-NEXT:    s_cbranch_vccnz .LBB7_3
-; SI-NEXT:  .LBB7_2: ; %if
 ; SI-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
 ; SI-NEXT:    s_mov_b32 s1, 0
-; SI-NEXT:  .LBB7_3: ; %endif
+; SI-NEXT:  .LBB7_5: ; %endif
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
-; SI-NEXT:  .LBB7_4:
-; SI-NEXT:    ; implicit-def: $sgpr0_sgpr1
-; SI-NEXT:    s_branch .LBB7_2
 ;
 ; VI-LABEL: ctpop_i64_in_br:
 ; VI:       ; %bb.0: ; %entry
@@ -368,15 +372,24 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_lg_u32 s0, 0
-; VI-NEXT:    s_cbranch_scc0 .LBB7_4
+; VI-NEXT:    s_cbranch_scc0 .LBB7_2
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
-; VI-NEXT:    s_cbranch_execnz .LBB7_3
-; VI-NEXT:  .LBB7_2: ; %if
+; VI-NEXT:    s_mov_b64 s[6:7], 0
+; VI-NEXT:    s_branch .LBB7_3
+; VI-NEXT:  .LBB7_2:
+; VI-NEXT:    s_mov_b64 s[6:7], -1
+; VI-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; VI-NEXT:  .LBB7_3: ; %Flow
+; VI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; VI-NEXT:    s_cselect_b32 s6, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s6, 1
+; VI-NEXT:    s_cbranch_scc1 .LBB7_5
+; VI-NEXT:  ; %bb.4: ; %if
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
 ; VI-NEXT:    s_mov_b32 s1, 0
-; VI-NEXT:  .LBB7_3: ; %endif
+; VI-NEXT:  .LBB7_5: ; %endif
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
@@ -384,9 +397,6 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
-; VI-NEXT:  .LBB7_4:
-; VI-NEXT:    ; implicit-def: $sgpr0_sgpr1
-; VI-NEXT:    s_branch .LBB7_2
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
index ddb635cabbab15..c7393764d51796 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll
@@ -31,13 +31,13 @@ define void @main(float %arg) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_FMAC_F32_e64_1]], %bb.1
   ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF]], %bb.0, [[V_ADD_F32_e64_]], %bb.1
-  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_MOV_B32_1]], %bb.0, [[S_MOV_B32_2]], %bb.1
-  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[PHI2]], implicit $exec
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, [[S_MOV_B32_2]], %bb.1
+  ; CHECK-NEXT:   $scc = COPY [[PHI2]]
+  ; CHECK-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
   ; CHECK-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
-  ; CHECK-NEXT:   S_CMP_LG_U32 killed [[COPY1]], killed [[S_MOV_B32_3]], implicit-def $scc
-  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
-  ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, killed [[COPY2]], implicit-def dead $scc
+  ; CHECK-NEXT:   S_CMP_LG_U32 killed [[S_CSELECT_B32_]], killed [[S_MOV_B32_3]], implicit-def $scc
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $scc
+  ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, killed [[COPY1]], implicit-def dead $scc
   ; CHECK-NEXT:   $vcc_lo = COPY [[S_AND_B32_1]]
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.4, implicit $vcc
   ; CHECK-NEXT:   S_BRANCH %bb.3
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
index 5cadb65c9c942f..0938155a2fc6c2 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 %
   ; GCN-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 2, [[S_LOAD_DWORD_IMM]], implicit-def dead $scc
   ; GCN-NEXT:   S_CMP_LG_U32 killed [[S_AND_B32_1]], 0, implicit-def $scc
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]]
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_64 = COPY [[COPY2]]
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def $scc
   ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
@@ -61,24 +61,28 @@ define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 %
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1.out.true:
   ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
-  ; GCN-NEXT:   [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[COPY3]], killed [[S_MOV_B64_]], implicit-def dead $scc
+  ; GCN-NEXT:   [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[COPY3]], killed [[S_MOV_B64_]], implicit-def dead $scc
   ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1
   ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0
   ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
   ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_3]], %subreg.sub2, killed [[S_MOV_B32_2]], %subreg.sub3
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_XOR_B64_]], implicit $exec
-  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
+  ; GCN-NEXT:   $scc = COPY [[S_XOR_B64_]]
+  ; GCN-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_]]
+  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[COPY6]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2.out.else:
-  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1
-  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0
+  ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1
+  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0
   ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
   ; GCN-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
-  ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[S_MOV_B32_5]], %subreg.sub2, killed [[S_MOV_B32_4]], %subreg.sub3
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[COPY3]], implicit $exec
-  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_1]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
+  ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[S_MOV_B32_5]], %subreg.sub2, killed [[S_MOV_B32_4]], %subreg.sub3
+  ; GCN-NEXT:   $scc = COPY [[COPY3]]
+  ; GCN-NEXT:   [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
+  ; GCN-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_1]]
+  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[COPY9]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
 entry:
   %0 = and i32 %x, 2
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 5b39cc2e185b7d..3c3d91a703a35f 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -24,9 +24,11 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
   ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN-NEXT:   S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_3]], implicit-def $scc
   ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc
+  ; GCN-NEXT:   $scc = COPY [[S_OR_B64_]]
+  ; GCN-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
+  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_]]
+  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[COPY6]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   %setcc = icmp slt i16 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
@@ -77,9 +79,11 @@ define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x
   ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GCN-NEXT:   S_CMP_LT_I32 killed [[COPY4]], killed [[S_MOV_B32_2]], implicit-def $scc
   ; GCN-NEXT:   [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY7]], killed [[COPY6]], implicit-def dead $scc
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[COPY7]], killed [[COPY6]], implicit-def dead $scc
+  ; GCN-NEXT:   $scc = COPY [[S_OR_B64_]]
+  ; GCN-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
+  ; GCN-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_]]
+  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[COPY8]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   %setcc = icmp slt i32 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
@@ -133,9 +137,11 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
   ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; GCN-NEXT:   [[COPY9:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
   ; GCN-NEXT:   [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY9]], implicit $exec
-  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY8]], implicit-def dead $scc
-  ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1)
+  ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY8]], implicit-def dead $scc
+  ; GCN-NEXT:   $scc = COPY [[S_OR_B64_]]
+  ; GCN-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 1, 0, implicit $scc
+  ; GCN-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_CSELECT_B32_]]
+  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[COPY10]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   %setcc = icmp slt i64 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 85ed2914b8c7f5..cce56708b66103 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -13,24 +13,20 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_addc_u32 s25, s25, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_bitcmp1_b32 s2, 0
-; CHECK-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; CHECK-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; CHECK-NEXT:    s_bitcmp1_b32 s2, 8
-; CHECK-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; CHECK-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; CHECK-NEXT:    s_bitcmp1_b32 s2, 16
-; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; CHECK-NEXT:    s_bitcmp1_b32 s0, 24
-; CHECK-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT:    s_xor_b64 s[4:5], s[8:9], -1
+; CHECK-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; CHECK-NEXT:    s_xor_b64 s[2:3], s[6:7], -1
 ; CHECK-NEXT:    s_bitcmp1_b32 s1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
 ; CHECK-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; CHECK-NEXT:    s_bitcmp1_b32 s14, 8
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[16:17]
 ; CHECK-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; CHECK-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
-; CHECK-NEXT:    s_and_b64 s[6:7], exec, s[10:11]
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; CHECK-NEXT:    s_and_b64 s[0:1], exec, s[2:3]
+; CHECK-NEXT:    s_and_b64 s[2:3], exec, s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_branch .LBB0_3
 ; CHECK-NEXT:  .LBB0_1: ; in Loop: Header=BB0_3 Depth=1
@@ -41,88 +37,103 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:  .LBB0_2: ; %Flow7
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[22:23]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_12
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_13
 ; CHECK-NEXT:  .LBB0_3: ; %bb7
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[2:3]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_1
+; CHECK-NEXT:    s_and_b64 s[16:17], s[10:11], exec
+; CHECK-NEXT:    s_cselect_b32 s16, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s16, 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.4: ; %bb8
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_mov_b64 vcc, s[4:5]
+; CHECK-NEXT:    s_mov_b64 vcc, s[0:1]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_6
 ; CHECK-NEXT:  ; %bb.5: ; %bb9
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[16:17], 0
 ; CHECK-NEXT:    s_mov_b64 s[18:19], -1
-; CHECK-NEXT:    s_mov_b64 s[22:23], s[10:11]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_7
-; CHECK-NEXT:    s_branch .LBB0_8
+; CHECK-NEXT:    s_mov_b64 s[22:23], s[8:9]
+; CHECK-NEXT:    s_branch .LBB0_7
 ; CHECK-NEXT:  .LBB0_6: ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[16:17], -1
 ; CHECK-NEXT:    s_mov_b64 s[18:19], 0
 ; CHECK-NEXT:    s_mov_b64 s[22:23], 0
-; CHECK-NEXT:  .LBB0_7: ; %bb10
+; CHECK-NEXT:  .LBB0_7: ; %Flow8
+; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    s_and_b64 s[20:21], s[16:17], exec
+; CHECK-NEXT:    s_cselect_b32 s20, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s20, 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_9
+; CHECK-NEXT:  ; %bb.8: ; %bb10
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[18:19], -1
 ; CHECK-NEXT:    s_mov_b64 s[16:17], 0
 ; CHECK-NEXT:    s_mov_b64 s[22:23], s[14:15]
-; CHECK-NEXT:  .LBB0_8: ; %Flow9
+; CHECK-NEXT:  .LBB0_9: ; %Flow9
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    s_and_b64 s[22:23], s[22:23], exec
+; CHECK-NEXT:    s_cselect_b32 s22, 1, 0
 ; CHECK-NEXT:    s_mov_b64 s[20:21], -1
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
+; CHECK-NEXT:    s_cmp_lg_u32 s22, 1
 ; CHECK-NEXT:    s_mov_b64 s[22:23], -1
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
-; CHECK-NEXT:  ; %bb.9: ; %bb13
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_2
+; CHECK-NEXT:  ; %bb.10: ; %bb13
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[16:17], 0
 ; CHECK-NEXT:    s_mov_b64 s[20:21], 0
-; CHECK-NEXT:    s_mov_b64 vcc, s[6:7]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_11
-; CHECK-NEXT:  ; %bb.10: ; %bb16
+; CHECK-NEXT:    s_mov_b64 vcc, s[2:3]
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_12
+; CHECK-NEXT:  ; %bb.11: ; %bb16
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[20:21], -1
 ; CHECK-NEXT:    s_mov_b64 s[22:23], s[12:13]
-; CHECK-NEXT:  .LBB0_11: ; %Flow11
+; CHECK-NEXT:  .LBB0_12: ; %Flow11
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[18:19], 0
 ; CHECK-NEXT:    s_branch .LBB0_2
-; CHECK-NEXT:  .LBB0_12: ; %loop.exit.guard6
+; CHECK-NEXT:  .LBB0_13: ; %loop.exit.guard6
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_xor_b64 s[22:23], s[20:21], -1
 ; CHECK-NEXT:    s_mov_b64 s[20:21], -1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[22:23]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_16
-; CHECK-NEXT:  ; %bb.13: ; %bb14
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_17
+; CHECK-NEXT:  ; %bb.14: ; %bb14
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_15
-; CHECK-NEXT:  ; %bb.14: ; %bb15
+; CHECK-NEXT:    s_and_b64 s[20:21], s[4:5], exec
+; CHECK-NEXT:    s_cselect_b32 s20, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s20, 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_16
+; CHECK-NEXT:  ; %bb.15: ; %bb15
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[24:27], 0
-; CHECK-NEXT:  .LBB0_15: ; %Flow
+; CHECK-NEXT:  .LBB0_16: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[20:21], 0
-; CHECK-NEXT:  .LBB0_16: ; %Flow13
+; CHECK-NEXT:  .LBB0_17: ; %Flow13
 ; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_3
-; CHECK-NEXT:  ; %bb.17: ; %loop.exit.guard
+; CHECK-NEXT:    s_and_b64 s[20:21], s[20:21], exec
+; CHECK-NEXT:    s_cselect_b32 s20, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s20, 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_3
+; CHECK-NEXT:  ; %bb.18: ; %loop.exit.guard
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[16:17]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_22
-; CHECK-NEXT:  ; %bb.18: ; %loop.exit.guard5
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_23
-; CHECK-NEXT:  ; %bb.19: ; %bb17
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_21
-; CHECK-NEXT:  ; %bb.20: ; %bb19
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:  ; %bb.19: ; %loop.exit.guard5
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[18:19]
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_24
+; CHECK-NEXT:  ; %bb.20: ; %bb17
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_22
-; CHECK-NEXT:  .LBB0_21: ; %bb18
+; CHECK-NEXT:  ; %bb.21: ; %bb19
+; CHECK-NEXT:    s_and_b64 s[0:1], s[4:5], exec
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 1
+; CHECK-NEXT:    s_cbranch_scc0 .LBB0_23
+; CHECK-NEXT:  .LBB0_22: ; %bb18
 ; CHECK-NEXT:    s_endpgm
-; CHECK-NEXT:  .LBB0_22: ; %bb20
-; CHECK-NEXT:  .LBB0_23: ; %bb12
+; CHECK-NEXT:  .LBB0_23: ; %bb20
+; CHECK-NEXT:  .LBB0_24: ; %bb12
 bb:
   br label %bb6
 
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index d6bde79802847c..59295276a95ffd 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -93,17 +93,18 @@ bb:
 define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-LABEL: s_add_co_br_user:
 ; GFX7:       ; %bb.0: ; %bb
-; GFX7-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_i32 s0, s2, s2
-; GFX7-NEXT:    s_cmp_lt_u32 s0, s2
+; GFX7-NEXT:    s_add_i32 s0, s4, s4
+; GFX7-NEXT:    s_cmp_lt_u32 s0, s4
 ; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX7-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX7-NEXT:    s_addc_u32 s0, s2, 0
-; GFX7-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v0
-; GFX7-NEXT:    s_cbranch_vccnz .LBB1_2
+; GFX7-NEXT:    s_addc_u32 s0, s4, 0
+; GFX7-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX7-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX7-NEXT:  ; %bb.1: ; %bb0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
@@ -120,16 +121,17 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ;
 ; GFX9-LABEL: s_add_co_br_user:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s2, s2
-; GFX9-NEXT:    s_cmp_lt_u32 s0, s2
+; GFX9-NEXT:    s_add_i32 s0, s4, s4
+; GFX9-NEXT:    s_cmp_lt_u32 s0, s4
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT:    s_addc_u32 s0, s2, 0
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v0
-; GFX9-NEXT:    s_cbranch_vccnz .LBB1_2
+; GFX9-NEXT:    s_addc_u32 s0, s4, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX9-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX9-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -151,11 +153,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX10-NEXT:    s_add_i32 s1, s0, s0
 ; GFX10-NEXT:    s_cmp_lt_u32 s1, s0
 ; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX10-NEXT:    s_and_b32 s2, s1, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_addc_u32 s0, s0, 0
-; GFX10-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s0, v0
-; GFX10-NEXT:    s_cbranch_vccnz .LBB1_2
+; GFX10-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX10-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %bb0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -178,12 +181,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lt_u32 s1, s0
 ; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    s_and_b32 s2, s1, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-NEXT:    s_addc_u32 s0, s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s0, v0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB1_2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX11-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 9
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index efbbe2b27f10f9..c185020ccce6ef 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -33,13 +33,17 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v3, v6, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v4, v5
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB0_3
-; SI-NEXT:    s_branch .LBB0_4
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_branch .LBB0_3
 ; SI-NEXT:  .LBB0_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; SI-NEXT:    s_mov_b64 vcc, 0
-; SI-NEXT:  .LBB0_3: ; %T
+; SI-NEXT:  .LBB0_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB0_5
+; SI-NEXT:  ; %bb.4: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -64,7 +68,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v0
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v1
-; SI-NEXT:  .LBB0_4: ; %exit
+; SI-NEXT:  .LBB0_5: ; %exit
 ; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
 ; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
@@ -93,14 +97,20 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; GFX9-NEXT:  ; %bb.1: ; %F
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_cbranch_execz .LBB0_3
-; GFX9-NEXT:    s_branch .LBB0_4
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_branch .LBB0_3
 ; GFX9-NEXT:  .LBB0_2:
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX9-NEXT:  .LBB0_3: ; %T
+; GFX9-NEXT:  .LBB0_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB0_5
+; GFX9-NEXT:  ; %bb.4: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:  .LBB0_4: ; %exit
+; GFX9-NEXT:  .LBB0_5: ; %exit
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
 ; GFX9-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-NEXT:    v_or_b32_e32 v1, 0xffff8000, v0
@@ -120,14 +130,21 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; GFX11-NEXT:  ; %bb.1: ; %F
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_cbranch_execz .LBB0_3
-; GFX11-NEXT:    s_branch .LBB0_4
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_branch .LBB0_3
 ; GFX11-NEXT:  .LBB0_2:
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX11-NEXT:  .LBB0_3: ; %T
+; GFX11-NEXT:  .LBB0_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB0_5
+; GFX11-NEXT:  ; %bb.4: ; %T
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB0_4: ; %exit
+; GFX11-NEXT:  .LBB0_5: ; %exit
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -190,13 +207,17 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v3
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB1_3
-; SI-NEXT:    s_branch .LBB1_4
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_branch .LBB1_3
 ; SI-NEXT:  .LBB1_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; SI-NEXT:    s_mov_b64 vcc, 0
-; SI-NEXT:  .LBB1_3: ; %T
+; SI-NEXT:  .LBB1_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB1_5
+; SI-NEXT:  ; %bb.4: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -221,7 +242,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v5, v4, v0
 ; SI-NEXT:    v_or_b32_e32 v4, v2, v1
-; SI-NEXT:  .LBB1_4: ; %exit
+; SI-NEXT:  .LBB1_5: ; %exit
 ; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v4
 ; SI-NEXT:    v_ashr_i64 v[0:1], v[4:5], 48
 ; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
@@ -251,14 +272,20 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX9-NEXT:  ; %bb.1: ; %F
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_cbranch_execz .LBB1_3
-; GFX9-NEXT:    s_branch .LBB1_4
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_branch .LBB1_3
 ; GFX9-NEXT:  .LBB1_2:
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX9-NEXT:  .LBB1_3: ; %T
+; GFX9-NEXT:  .LBB1_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX9-NEXT:  ; %bb.4: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:  .LBB1_4: ; %exit
+; GFX9-NEXT:  .LBB1_5: ; %exit
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-NEXT:    v_or_b32_e32 v1, 0xffff8000, v0
@@ -278,14 +305,21 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX11-NEXT:  ; %bb.1: ; %F
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_cbranch_execz .LBB1_3
-; GFX11-NEXT:    s_branch .LBB1_4
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_branch .LBB1_3
 ; GFX11-NEXT:  .LBB1_2:
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX11-NEXT:  .LBB1_3: ; %T
+; GFX11-NEXT:  .LBB1_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX11-NEXT:  ; %bb.4: ; %T
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB1_4: ; %exit
+; GFX11-NEXT:  .LBB1_5: ; %exit
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -351,15 +385,19 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB2_3
-; SI-NEXT:    s_branch .LBB2_4
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_branch .LBB2_3
 ; SI-NEXT:  .LBB2_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_mov_b64 vcc, 0
-; SI-NEXT:  .LBB2_3: ; %T
+; SI-NEXT:  .LBB2_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB2_5
+; SI-NEXT:  ; %bb.4: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -387,7 +425,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:  .LBB2_4: ; %exit
+; SI-NEXT:  .LBB2_5: ; %exit
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -412,14 +450,20 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX9-NEXT:  ; %bb.1: ; %F
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_cbranch_execz .LBB2_3
-; GFX9-NEXT:    s_branch .LBB2_4
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_branch .LBB2_3
 ; GFX9-NEXT:  .LBB2_2:
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX9-NEXT:  .LBB2_3: ; %T
+; GFX9-NEXT:  .LBB2_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB2_5
+; GFX9-NEXT:  ; %bb.4: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:  .LBB2_4: ; %exit
+; GFX9-NEXT:  .LBB2_5: ; %exit
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3900
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3d00
 ; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
@@ -442,14 +486,21 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX11-NEXT:  ; %bb.1: ; %F
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_cbranch_execz .LBB2_3
-; GFX11-NEXT:    s_branch .LBB2_4
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_branch .LBB2_3
 ; GFX11-NEXT:  .LBB2_2:
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX11-NEXT:  .LBB2_3: ; %T
+; GFX11-NEXT:  .LBB2_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB2_5
+; GFX11-NEXT:  ; %bb.4: ; %T
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB2_4: ; %exit
+; GFX11-NEXT:  .LBB2_5: ; %exit
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
@@ -530,13 +581,17 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v3, v6, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v4, v5
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB3_3
-; SI-NEXT:    s_branch .LBB3_4
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_branch .LBB3_3
 ; SI-NEXT:  .LBB3_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; SI-NEXT:    s_mov_b64 vcc, 0
-; SI-NEXT:  .LBB3_3: ; %T
+; SI-NEXT:  .LBB3_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB3_5
+; SI-NEXT:  ; %bb.4: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -577,7 +632,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v0
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v1
-; SI-NEXT:  .LBB3_4: ; %exit
+; SI-NEXT:  .LBB3_5: ; %exit
 ; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v2
 ; SI-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; SI-NEXT:    v_bfe_i32 v2, v3, 0, 16
@@ -609,17 +664,23 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
-; GFX9-NEXT:    s_cbranch_execz .LBB3_3
-; GFX9-NEXT:    s_branch .LBB3_4
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_branch .LBB3_3
 ; GFX9-NEXT:  .LBB3_2:
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-; GFX9-NEXT:  .LBB3_3: ; %T
+; GFX9-NEXT:  .LBB3_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX9-NEXT:  ; %bb.4: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GFX9-NEXT:  .LBB3_4: ; %exit
+; GFX9-NEXT:  .LBB3_5: ; %exit
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
 ; GFX9-NEXT:    s_movk_i32 s4, 0x8000
@@ -642,16 +703,23 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_cbranch_execz .LBB3_3
-; GFX11-NEXT:    s_branch .LBB3_4
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_branch .LBB3_3
 ; GFX11-NEXT:  .LBB3_2:
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:  .LBB3_3: ; %T
+; GFX11-NEXT:  .LBB3_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX11-NEXT:  ; %bb.4: ; %T
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB3_4: ; %exit
+; GFX11-NEXT:  .LBB3_5: ; %exit
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -731,13 +799,17 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v3
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB4_3
-; SI-NEXT:    s_branch .LBB4_4
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_branch .LBB4_3
 ; SI-NEXT:  .LBB4_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; SI-NEXT:    s_mov_b64 vcc, 0
-; SI-NEXT:  .LBB4_3: ; %T
+; SI-NEXT:  .LBB4_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB4_5
+; SI-NEXT:  ; %bb.4: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -778,7 +850,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v5, v4, v0
 ; SI-NEXT:    v_or_b32_e32 v4, v2, v1
-; SI-NEXT:  .LBB4_4: ; %exit
+; SI-NEXT:  .LBB4_5: ; %exit
 ; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v4
 ; SI-NEXT:    v_ashr_i64 v[0:1], v[4:5], 48
 ; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
@@ -811,17 +883,23 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
-; GFX9-NEXT:    s_cbranch_execz .LBB4_3
-; GFX9-NEXT:    s_branch .LBB4_4
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_branch .LBB4_3
 ; GFX9-NEXT:  .LBB4_2:
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-; GFX9-NEXT:  .LBB4_3: ; %T
+; GFX9-NEXT:  .LBB4_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX9-NEXT:  ; %bb.4: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GFX9-NEXT:  .LBB4_4: ; %exit
+; GFX9-NEXT:  .LBB4_5: ; %exit
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
 ; GFX9-NEXT:    s_movk_i32 s4, 0x8000
@@ -844,16 +922,23 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_cbranch_execz .LBB4_3
-; GFX11-NEXT:    s_branch .LBB4_4
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_branch .LBB4_3
 ; GFX11-NEXT:  .LBB4_2:
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:  .LBB4_3: ; %T
+; GFX11-NEXT:  .LBB4_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX11-NEXT:  ; %bb.4: ; %T
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB4_4: ; %exit
+; GFX11-NEXT:  .LBB4_5: ; %exit
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -936,15 +1021,19 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB5_3
-; SI-NEXT:    s_branch .LBB5_4
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_branch .LBB5_3
 ; SI-NEXT:  .LBB5_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_mov_b64 vcc, 0
-; SI-NEXT:  .LBB5_3: ; %T
+; SI-NEXT:  .LBB5_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB5_5
+; SI-NEXT:  ; %bb.4: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -988,7 +1077,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:  .LBB5_4: ; %exit
+; SI-NEXT:  .LBB5_5: ; %exit
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -1016,17 +1105,23 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
-; GFX9-NEXT:    s_cbranch_execz .LBB5_3
-; GFX9-NEXT:    s_branch .LBB5_4
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_branch .LBB5_3
 ; GFX9-NEXT:  .LBB5_2:
+; GFX9-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-; GFX9-NEXT:  .LBB5_3: ; %T
+; GFX9-NEXT:  .LBB5_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB5_5
+; GFX9-NEXT:  ; %bb.4: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GFX9-NEXT:  .LBB5_4: ; %exit
+; GFX9-NEXT:  .LBB5_5: ; %exit
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3900
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3d00
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1052,16 +1147,23 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_cbranch_execz .LBB5_3
-; GFX11-NEXT:    s_branch .LBB5_4
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_branch .LBB5_3
 ; GFX11-NEXT:  .LBB5_2:
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:  .LBB5_3: ; %T
+; GFX11-NEXT:  .LBB5_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB5_5
+; GFX11-NEXT:  ; %bb.4: ; %T
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB5_4: ; %exit
+; GFX11-NEXT:  .LBB5_5: ; %exit
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
@@ -1235,13 +1337,17 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    v_or_b32_e32 v4, v8, v3
 ; SI-NEXT:    v_or_b32_e32 v3, v7, v9
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v11
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB7_3
-; SI-NEXT:    s_branch .LBB7_4
+; SI-NEXT:    s_mov_b64 s[34:35], 0
+; SI-NEXT:    s_branch .LBB7_3
 ; SI-NEXT:  .LBB7_2:
+; SI-NEXT:    s_mov_b64 s[34:35], -1
 ; SI-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; SI-NEXT:    s_mov_b64 vcc, 0
-; SI-NEXT:  .LBB7_3: ; %T
+; SI-NEXT:  .LBB7_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; SI-NEXT:    s_cselect_b32 s34, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s34, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB7_5
+; SI-NEXT:  ; %bb.4: ; %T
 ; SI-NEXT:    s_mov_b32 s39, 0xf000
 ; SI-NEXT:    s_mov_b32 s36, s38
 ; SI-NEXT:    s_mov_b32 s37, s38
@@ -1285,7 +1391,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    v_or_b32_e32 v4, v7, v1
 ; SI-NEXT:    v_or_b32_e32 v3, v6, v9
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v10
-; SI-NEXT:  .LBB7_4: ; %exit
+; SI-NEXT:  .LBB7_5: ; %exit
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
@@ -1340,17 +1446,23 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
-; GFX9-NEXT:    s_cbranch_execz .LBB7_3
-; GFX9-NEXT:    s_branch .LBB7_4
+; GFX9-NEXT:    s_mov_b64 s[34:35], 0
+; GFX9-NEXT:    s_branch .LBB7_3
 ; GFX9-NEXT:  .LBB7_2:
+; GFX9-NEXT:    s_mov_b64 s[34:35], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-; GFX9-NEXT:  .LBB7_3: ; %T
+; GFX9-NEXT:  .LBB7_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s34, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX9-NEXT:  ; %bb.4: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GFX9-NEXT:  .LBB7_4: ; %exit
+; GFX9-NEXT:  .LBB7_5: ; %exit
 ; GFX9-NEXT:    s_movk_i32 s35, 0x3801
 ; GFX9-NEXT:    s_movk_i32 s34, 0x3800
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3900
@@ -1395,17 +1507,22 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB7_3
-; GFX11-NEXT:    s_branch .LBB7_4
+; GFX11-NEXT:    s_branch .LBB7_3
 ; GFX11-NEXT:  .LBB7_2:
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:  .LBB7_3: ; %T
+; GFX11-NEXT:  .LBB7_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX11-NEXT:  ; %bb.4: ; %T
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB7_4: ; %exit
+; GFX11-NEXT:  .LBB7_5: ; %exit
 ; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v9, 0x3900
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
@@ -1516,10 +1633,10 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v11
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB8_3
-; SI-NEXT:    s_branch .LBB8_4
+; SI-NEXT:    s_mov_b64 s[34:35], 0
+; SI-NEXT:    s_branch .LBB8_3
 ; SI-NEXT:  .LBB8_2:
+; SI-NEXT:    s_mov_b64 s[34:35], -1
 ; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr8
@@ -1528,8 +1645,12 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_mov_b64 vcc, 0
-; SI-NEXT:  .LBB8_3: ; %T
+; SI-NEXT:  .LBB8_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; SI-NEXT:    s_cselect_b32 s34, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s34, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB8_5
+; SI-NEXT:  ; %bb.4: ; %T
 ; SI-NEXT:    s_mov_b32 s39, 0xf000
 ; SI-NEXT:    s_mov_b32 s36, s38
 ; SI-NEXT:    s_mov_b32 s37, s38
@@ -1581,7 +1702,7 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:  .LBB8_4: ; %exit
+; SI-NEXT:  .LBB8_5: ; %exit
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v8
@@ -1633,17 +1754,23 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
-; GFX9-NEXT:    s_cbranch_execz .LBB8_3
-; GFX9-NEXT:    s_branch .LBB8_4
+; GFX9-NEXT:    s_mov_b64 s[34:35], 0
+; GFX9-NEXT:    s_branch .LBB8_3
 ; GFX9-NEXT:  .LBB8_2:
+; GFX9-NEXT:    s_mov_b64 s[34:35], -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
-; GFX9-NEXT:  .LBB8_3: ; %T
+; GFX9-NEXT:  .LBB8_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s34, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB8_5
+; GFX9-NEXT:  ; %bb.4: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
-; GFX9-NEXT:  .LBB8_4: ; %exit
+; GFX9-NEXT:  .LBB8_5: ; %exit
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3800
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3900
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3d00
@@ -1686,17 +1813,22 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB8_3
-; GFX11-NEXT:    s_branch .LBB8_4
+; GFX11-NEXT:    s_branch .LBB8_3
 ; GFX11-NEXT:  .LBB8_2:
+; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:  .LBB8_3: ; %T
+; GFX11-NEXT:  .LBB8_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB8_5
+; GFX11-NEXT:  ; %bb.4: ; %T
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB8_4: ; %exit
+; GFX11-NEXT:  .LBB8_5: ; %exit
 ; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
 ; GFX11-NEXT:    v_mov_b32_e32 v9, 0x3900
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 54ec7578700df8..966f9d701f9dea 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -33,16 +33,14 @@ define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s4, 1
-; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, 1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 2
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 2, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 4, v0, vcc
+; GCN-NEXT:    s_cselect_b32 s2, s2, 4
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -730,10 +728,8 @@ define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 1
-; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GCN-NEXT:    s_cselect_b32 s2, 1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 2
-; GCN-NEXT:    v_readfirstlane_b32 s2, v0
 ; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 3
 ; GCN-NEXT:    s_cselect_b32 s2, s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 13367d3bb36e29..41b7f360621a40 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1518,26 +1518,23 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
 ; SI-NEXT:    s_lshr_b32 s4, s3, 8
 ; SI-NEXT:    s_and_b32 s5, s3, 0x1ff
-; SI-NEXT:    s_and_b32 s6, s4, 0xffe
+; SI-NEXT:    s_and_b32 s4, s4, 0xffe
 ; SI-NEXT:    s_or_b32 s2, s5, s2
 ; SI-NEXT:    s_cmp_lg_u32 s2, 0
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; SI-NEXT:    v_readfirstlane_b32 s2, v1
+; SI-NEXT:    s_cselect_b32 s2, 1, 0
 ; SI-NEXT:    s_bfe_u32 s5, s3, 0xb0014
-; SI-NEXT:    s_or_b32 s2, s6, s2
 ; SI-NEXT:    s_sub_i32 s6, 0x3f1, s5
+; SI-NEXT:    s_or_b32 s2, s4, s2
 ; SI-NEXT:    v_med3_i32 v1, s6, 0, 13
 ; SI-NEXT:    s_or_b32 s4, s2, 0x1000
 ; SI-NEXT:    v_readfirstlane_b32 s6, v1
-; SI-NEXT:    s_lshr_b32 s6, s4, s6
-; SI-NEXT:    v_lshl_b32_e32 v1, s6, v1
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v1
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-NEXT:    s_lshr_b32 s7, s4, s6
+; SI-NEXT:    s_lshl_b32 s6, s7, s6
+; SI-NEXT:    s_cmp_lg_u32 s6, s4
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
 ; SI-NEXT:    s_add_i32 s8, s5, 0xfffffc10
-; SI-NEXT:    v_readfirstlane_b32 s4, v1
 ; SI-NEXT:    s_lshl_b32 s5, s8, 12
-; SI-NEXT:    s_or_b32 s4, s6, s4
+; SI-NEXT:    s_or_b32 s4, s7, s4
 ; SI-NEXT:    s_or_b32 s5, s2, s5
 ; SI-NEXT:    s_cmp_lt_i32 s8, 1
 ; SI-NEXT:    s_cselect_b32 s9, s4, s5
@@ -1552,20 +1549,17 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    s_addc_u32 s4, s6, 0
 ; SI-NEXT:    s_cmp_lt_i32 s8, 31
-; SI-NEXT:    s_cselect_b32 s6, s4, 0x7c00
+; SI-NEXT:    s_cselect_b32 s4, s4, 0x7c00
 ; SI-NEXT:    s_cmp_lg_u32 s2, 0
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 9, v1
+; SI-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-NEXT:    s_lshl_b32 s2, s2, 9
+; SI-NEXT:    s_or_b32 s2, s2, 0x7c00
 ; SI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
-; SI-NEXT:    v_or_b32_e32 v1, 0x7c00, v1
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s2, s3, 16
-; SI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; SI-NEXT:    s_and_b32 s2, s2, 0x8000
-; SI-NEXT:    v_or_b32_e32 v1, s2, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    s_cselect_b32 s2, s2, s4
+; SI-NEXT:    s_lshr_b32 s3, s3, 16
+; SI-NEXT:    s_and_b32 s3, s3, 0x8000
+; SI-NEXT:    s_or_b32 s2, s3, s2
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
 ; SI-NEXT:    s_brev_b32 s2, -2
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_bfi_b32 v0, s2, v1, v0
@@ -1581,28 +1575,25 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s0, s7, 8
 ; VI-NEXT:    s_and_b32 s1, s7, 0x1ff
-; VI-NEXT:    s_and_b32 s2, s0, 0xffe
-; VI-NEXT:    s_or_b32 s0, s1, s6
-; VI-NEXT:    s_cmp_lg_u32 s0, 0
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT:    v_readfirstlane_b32 s0, v2
-; VI-NEXT:    s_bfe_u32 s1, s7, 0xb0014
+; VI-NEXT:    s_and_b32 s0, s0, 0xffe
+; VI-NEXT:    s_or_b32 s1, s1, s6
+; VI-NEXT:    s_cmp_lg_u32 s1, 0
+; VI-NEXT:    s_cselect_b32 s1, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    s_or_b32 s4, s2, s0
+; VI-NEXT:    s_or_b32 s4, s0, s1
+; VI-NEXT:    s_bfe_u32 s1, s7, 0xb0014
 ; VI-NEXT:    s_sub_i32 s2, 0x3f1, s1
 ; VI-NEXT:    v_med3_i32 v2, s2, 0, 13
 ; VI-NEXT:    s_or_b32 s0, s4, 0x1000
 ; VI-NEXT:    v_readfirstlane_b32 s2, v2
-; VI-NEXT:    s_lshr_b32 s2, s0, s2
-; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, s0, v2
+; VI-NEXT:    s_lshr_b32 s3, s0, s2
+; VI-NEXT:    s_lshl_b32 s2, s3, s2
+; VI-NEXT:    s_cmp_lg_u32 s2, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    s_add_i32 s5, s1, 0xfffffc10
-; VI-NEXT:    v_readfirstlane_b32 s0, v2
 ; VI-NEXT:    s_lshl_b32 s1, s5, 12
-; VI-NEXT:    s_or_b32 s0, s2, s0
+; VI-NEXT:    s_or_b32 s0, s3, s0
 ; VI-NEXT:    s_or_b32 s1, s4, s1
 ; VI-NEXT:    s_cmp_lt_i32 s5, 1
 ; VI-NEXT:    s_cselect_b32 s6, s0, s1
@@ -1616,19 +1607,17 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; VI-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; VI-NEXT:    s_addc_u32 s0, s2, 0
 ; VI-NEXT:    s_cmp_lt_i32 s5, 31
-; VI-NEXT:    s_cselect_b32 s2, s0, 0x7c00
+; VI-NEXT:    s_cselect_b32 s0, s0, 0x7c00
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 9, v2
+; VI-NEXT:    s_cselect_b32 s1, 1, 0
+; VI-NEXT:    s_lshl_b32 s1, s1, 9
+; VI-NEXT:    s_or_b32 s1, s1, 0x7c00
 ; VI-NEXT:    s_cmpk_eq_i32 s5, 0x40f
-; VI-NEXT:    v_or_b32_e32 v2, 0x7c00, v2
-; VI-NEXT:    v_mov_b32_e32 v3, s2
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; VI-NEXT:    s_movk_i32 s0, 0x7fff
+; VI-NEXT:    s_cselect_b32 s0, s1, s0
+; VI-NEXT:    s_movk_i32 s1, 0x7fff
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_mov_b32_e32 v3, s8
-; VI-NEXT:    v_bfi_b32 v2, s0, v2, v3
+; VI-NEXT:    v_bfi_b32 v2, s1, v2, v3
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -1640,26 +1629,23 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s0, s7, 8
 ; GFX9-NEXT:    s_and_b32 s1, s7, 0x1ff
-; GFX9-NEXT:    s_and_b32 s2, s0, 0xffe
-; GFX9-NEXT:    s_or_b32 s0, s1, s6
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xffe
+; GFX9-NEXT:    s_or_b32 s1, s1, s6
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_or_b32 s6, s0, s1
 ; GFX9-NEXT:    s_bfe_u32 s1, s7, 0xb0014
-; GFX9-NEXT:    s_or_b32 s6, s2, s0
 ; GFX9-NEXT:    s_sub_i32 s2, 0x3f1, s1
 ; GFX9-NEXT:    v_med3_i32 v1, s2, 0, 13
 ; GFX9-NEXT:    s_or_b32 s0, s6, 0x1000
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX9-NEXT:    s_lshr_b32 s2, s0, s2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, s0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    s_lshr_b32 s3, s0, s2
+; GFX9-NEXT:    s_lshl_b32 s2, s3, s2
+; GFX9-NEXT:    s_cmp_lg_u32 s2, s0
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    s_add_i32 s7, s1, 0xfffffc10
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    s_lshl_b32 s1, s7, 12
-; GFX9-NEXT:    s_or_b32 s0, s2, s0
+; GFX9-NEXT:    s_or_b32 s0, s3, s0
 ; GFX9-NEXT:    s_or_b32 s1, s6, s1
 ; GFX9-NEXT:    s_cmp_lt_i32 s7, 1
 ; GFX9-NEXT:    s_cselect_b32 s9, s0, s1
@@ -1673,19 +1659,17 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_addc_u32 s0, s2, 0
 ; GFX9-NEXT:    s_cmp_lt_i32 s7, 31
-; GFX9-NEXT:    s_cselect_b32 s2, s0, 0x7c00
+; GFX9-NEXT:    s_cselect_b32 s0, s0, 0x7c00
 ; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 9, v1
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 9
+; GFX9-NEXT:    s_or_b32 s1, s1, 0x7c00
 ; GFX9-NEXT:    s_cmpk_eq_i32 s7, 0x40f
-; GFX9-NEXT:    v_or_b32_e32 v1, 0x7c00, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    v_bfi_b32 v1, s0, v1, v2
+; GFX9-NEXT:    v_bfi_b32 v1, s1, v1, v2
 ; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1694,62 +1678,56 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x34
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s7, 0x1ff
 ; GFX11-NEXT:    s_lshr_b32 s2, s7, 8
 ; GFX11-NEXT:    s_or_b32 s1, s1, s6
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xffe
 ; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX11-NEXT:    s_bfe_u32 s1, s7, 0xb0014
-; GFX11-NEXT:    s_sub_i32 s3, 0x3f1, s1
-; GFX11-NEXT:    s_addk_i32 s1, 0xfc10
-; GFX11-NEXT:    v_med3_i32 v1, s3, 0, 13
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX11-NEXT:    s_lshl_b32 s7, s1, 12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v1
-; GFX11-NEXT:    s_or_b32 s2, s2, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    s_or_b32 s3, s2, 0x1000
-; GFX11-NEXT:    s_or_b32 s7, s2, s7
-; GFX11-NEXT:    s_lshr_b32 s6, s3, s6
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v1, s6
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s3, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX11-NEXT:    s_or_b32 s3, s6, s3
-; GFX11-NEXT:    s_cmp_lt_i32 s1, 1
-; GFX11-NEXT:    s_cselect_b32 s3, s3, s7
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    s_bfe_u32 s3, s7, 0xb0014
+; GFX11-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-NEXT:    s_sub_i32 s6, 0x3f1, s3
+; GFX11-NEXT:    s_or_b32 s2, s1, 0x1000
+; GFX11-NEXT:    v_med3_i32 v0, s6, 0, 13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_lshr_b32 s7, s2, s6
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s6, s3, 7
+; GFX11-NEXT:    s_lshl_b32 s6, s7, s6
+; GFX11-NEXT:    s_cmp_lg_u32 s6, s2
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    s_addk_i32 s3, 0xfc10
+; GFX11-NEXT:    s_or_b32 s2, s7, s2
+; GFX11-NEXT:    s_lshl_b32 s6, s3, 12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s6, s1, s6
+; GFX11-NEXT:    s_cmp_lt_i32 s3, 1
+; GFX11-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX11-NEXT:    s_and_b32 s6, s2, 7
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_gt_i32 s6, 5
 ; GFX11-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-NEXT:    s_lshr_b32 s3, s3, 2
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 2
 ; GFX11-NEXT:    s_or_b32 s6, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX11-NEXT:    s_addc_u32 s3, s3, 0
-; GFX11-NEXT:    s_cmp_lt_i32 s1, 31
-; GFX11-NEXT:    s_cselect_b32 s3, s3, 0x7c00
-; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-NEXT:    s_cmpk_eq_i32 s1, 0x40f
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v0, vcc_lo
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s0
+; GFX11-NEXT:    s_addc_u32 s2, s2, 0
+; GFX11-NEXT:    s_cmp_lt_i32 s3, 31
+; GFX11-NEXT:    s_cselect_b32 s2, s2, 0x7c00
+; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 9
+; GFX11-NEXT:    s_or_b32 s1, s1, 0x7c00
+; GFX11-NEXT:    s_cmpk_eq_i32 s3, 0x40f
+; GFX11-NEXT:    s_cselect_b32 s0, s1, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
 ; GFX11-NEXT:    global_store_b16 v1, v0, s[4:5]
 ; GFX11-NEXT:    s_endpgm
   %mag.trunc = fptrunc double %mag to half
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 839f4a18508e5b..8c0c643aac3471 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -18,16 +18,11 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB0_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB0_4
-; GCN1-NEXT:  .LBB0_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB0_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB0_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -35,8 +30,13 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB0_2
-; GCN1-NEXT:  .LBB0_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB0_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB0_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -52,6 +52,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB0_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_add_i64_offset:
@@ -68,16 +69,11 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB0_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB0_4
-; GCN2-NEXT:  .LBB0_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB0_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB0_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -85,8 +81,13 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB0_2
-; GCN2-NEXT:  .LBB0_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB0_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB0_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -101,6 +102,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB0_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_add_i64_offset:
@@ -111,23 +113,24 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB0_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB0_4
-; GFX12-NEXT:  .LBB0_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB0_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB0_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB0_2
-; GFX12-NEXT:  .LBB0_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB0_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB0_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -136,6 +139,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s2
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB0_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -159,9 +163,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB1_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -170,11 +174,17 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB1_3
-; GCN1-NEXT:    s_branch .LBB1_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB1_3
 ; GCN1-NEXT:  .LBB1_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB1_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB1_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB1_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s1
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -190,7 +200,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB1_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB1_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -211,9 +221,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB1_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -222,11 +232,17 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB1_3
-; GCN2-NEXT:    s_branch .LBB1_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB1_3
 ; GCN2-NEXT:  .LBB1_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB1_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB1_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB1_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -241,7 +257,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB1_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB1_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -252,25 +268,32 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB1_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB1_3
-; GFX12-NEXT:    s_branch .LBB1_4
+; GFX12-NEXT:    s_branch .LBB1_3
 ; GFX12-NEXT:  .LBB1_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB1_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB1_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -279,7 +302,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s0
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB1_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB1_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -309,16 +332,11 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB2_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB2_4
-; GCN1-NEXT:  .LBB2_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB2_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB2_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -326,8 +344,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB2_2
-; GCN1-NEXT:  .LBB2_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB2_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB2_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -343,36 +366,33 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB2_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_add_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB2_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB2_4
-; GCN2-NEXT:  .LBB2_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB2_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB2_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -380,8 +400,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB2_2
-; GCN2-NEXT:  .LBB2_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB2_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB2_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -396,6 +421,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB2_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_add_i64_addr64_offset:
@@ -410,24 +436,25 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB2_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB2_4
-; GFX12-NEXT:  .LBB2_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB2_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB2_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB2_2
-; GFX12-NEXT:  .LBB2_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB2_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB2_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -436,6 +463,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s2
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB2_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -462,9 +490,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB3_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB3_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -473,11 +501,17 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB3_3
-; GCN1-NEXT:    s_branch .LBB3_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB3_3
 ; GCN1-NEXT:  .LBB3_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB3_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB3_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB3_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s9
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -493,7 +527,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB3_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB3_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -507,8 +541,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -516,9 +550,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB3_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB3_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -527,11 +561,17 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB3_3
-; GCN2-NEXT:    s_branch .LBB3_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB3_3
 ; GCN2-NEXT:  .LBB3_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB3_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB3_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB3_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -546,7 +586,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB3_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB3_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -562,21 +602,28 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB3_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB3_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB3_3
-; GFX12-NEXT:    s_branch .LBB3_4
+; GFX12-NEXT:    s_branch .LBB3_3
 ; GFX12-NEXT:  .LBB3_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB3_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB3_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -585,7 +632,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s4
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB3_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB3_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -610,16 +657,11 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB4_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB4_4
-; GCN1-NEXT:  .LBB4_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB4_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -627,8 +669,13 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB4_2
-; GCN1-NEXT:  .LBB4_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB4_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB4_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
@@ -644,6 +691,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB4_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_add_i64:
@@ -658,16 +706,11 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB4_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB4_4
-; GCN2-NEXT:  .LBB4_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB4_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -675,8 +718,13 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB4_2
-; GCN2-NEXT:  .LBB4_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB4_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB4_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -691,6 +739,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB4_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_add_i64:
@@ -699,24 +748,25 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB4_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB4_4
-; GFX12-NEXT:  .LBB4_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB4_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB4_2
-; GFX12-NEXT:  .LBB4_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB4_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB4_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -724,6 +774,7 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s2
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB4_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst
@@ -744,9 +795,9 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB5_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB5_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -755,11 +806,17 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB5_3
-; GCN1-NEXT:    s_branch .LBB5_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB5_3
 ; GCN1-NEXT:  .LBB5_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB5_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB5_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB5_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s1
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -775,7 +832,7 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB5_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB5_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -794,9 +851,9 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB5_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB5_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -805,11 +862,17 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB5_3
-; GCN2-NEXT:    s_branch .LBB5_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB5_3
 ; GCN2-NEXT:  .LBB5_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB5_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB5_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB5_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -824,7 +887,7 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB5_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB5_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -838,21 +901,28 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB5_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB5_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB5_3
-; GFX12-NEXT:    s_branch .LBB5_4
+; GFX12-NEXT:    s_branch .LBB5_3
 ; GFX12-NEXT:  .LBB5_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB5_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB5_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB5_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -860,7 +930,7 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s0
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB5_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB5_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -887,16 +957,11 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB6_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB6_4
-; GCN1-NEXT:  .LBB6_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB6_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB6_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -904,8 +969,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB6_2
-; GCN1-NEXT:  .LBB6_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB6_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB6_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -921,6 +991,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB6_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_add_i64_addr64:
@@ -939,16 +1010,11 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB6_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB6_4
-; GCN2-NEXT:  .LBB6_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB6_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB6_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -956,8 +1022,13 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB6_2
-; GCN2-NEXT:  .LBB6_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB6_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB6_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -972,6 +1043,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB6_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_add_i64_addr64:
@@ -979,30 +1051,31 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB6_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB6_4
-; GFX12-NEXT:  .LBB6_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB6_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB6_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB6_2
-; GFX12-NEXT:  .LBB6_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB6_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -1011,6 +1084,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s2
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB6_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -1034,9 +1108,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB7_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB7_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -1045,11 +1119,17 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB7_3
-; GCN1-NEXT:    s_branch .LBB7_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB7_3
 ; GCN1-NEXT:  .LBB7_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB7_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB7_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB7_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s9
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -1065,7 +1145,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB7_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB7_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1086,9 +1166,9 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB7_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB7_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -1097,11 +1177,17 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB7_3
-; GCN2-NEXT:    s_branch .LBB7_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB7_3
 ; GCN2-NEXT:  .LBB7_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB7_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB7_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB7_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -1116,7 +1202,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB7_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB7_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1125,27 +1211,34 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_add_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB7_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB7_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB7_3
-; GFX12-NEXT:    s_branch .LBB7_4
+; GFX12-NEXT:    s_branch .LBB7_3
 ; GFX12-NEXT:  .LBB7_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB7_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB7_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -1154,7 +1247,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s4
 ; GFX12-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB7_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB7_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1180,16 +1273,11 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB8_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB8_4
-; GCN1-NEXT:  .LBB8_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB8_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB8_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -1197,8 +1285,13 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB8_2
-; GCN1-NEXT:  .LBB8_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB8_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB8_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -1213,6 +1306,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB8_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_and_i64_offset:
@@ -1229,16 +1323,11 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB8_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB8_4
-; GCN2-NEXT:  .LBB8_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB8_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB8_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -1246,8 +1335,13 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB8_2
-; GCN2-NEXT:  .LBB8_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB8_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB8_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -1261,6 +1355,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB8_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_and_i64_offset:
@@ -1271,23 +1366,24 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB8_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB8_4
-; GFX12-NEXT:  .LBB8_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB8_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB8_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB8_2
-; GFX12-NEXT:  .LBB8_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB8_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB8_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -1296,6 +1392,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_and_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB8_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -1319,9 +1416,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB9_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -1330,11 +1427,17 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB9_3
-; GCN1-NEXT:    s_branch .LBB9_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB9_3
 ; GCN1-NEXT:  .LBB9_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB9_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB9_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB9_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN1-NEXT:    s_cselect_b32 s2, s2, -1
@@ -1349,7 +1452,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_and_b32_e32 v5, s1, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB9_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB9_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1370,9 +1473,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB9_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -1381,11 +1484,17 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB9_3
-; GCN2-NEXT:    s_branch .LBB9_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB9_3
 ; GCN2-NEXT:  .LBB9_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB9_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB9_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB9_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -1399,7 +1508,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_and_b32_e32 v5, s1, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB9_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB9_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1410,25 +1519,32 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB9_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB9_3
-; GFX12-NEXT:    s_branch .LBB9_4
+; GFX12-NEXT:    s_branch .LBB9_3
 ; GFX12-NEXT:  .LBB9_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB9_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB9_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -1437,7 +1553,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_and_b32_e32 v3, s1, v1
 ; GFX12-NEXT:    v_and_b32_e32 v2, s0, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB9_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB9_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1467,16 +1583,11 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB10_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB10_4
-; GCN1-NEXT:  .LBB10_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB10_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB10_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -1484,8 +1595,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB10_2
-; GCN1-NEXT:  .LBB10_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB10_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB10_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -1500,36 +1616,33 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB10_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_and_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB10_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB10_4
-; GCN2-NEXT:  .LBB10_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB10_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB10_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -1537,8 +1650,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB10_2
-; GCN2-NEXT:  .LBB10_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB10_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB10_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -1552,6 +1670,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB10_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_and_i64_addr64_offset:
@@ -1566,24 +1685,25 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB10_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB10_4
-; GFX12-NEXT:  .LBB10_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB10_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB10_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB10_2
-; GFX12-NEXT:  .LBB10_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB10_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB10_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -1592,6 +1712,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    v_and_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB10_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -1618,9 +1739,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB11_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB11_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -1629,11 +1750,17 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB11_3
-; GCN1-NEXT:    s_branch .LBB11_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB11_3
 ; GCN1-NEXT:  .LBB11_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB11_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB11_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB11_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -1648,7 +1775,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_and_b32_e32 v5, s9, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB11_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB11_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1662,8 +1789,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -1671,9 +1798,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB11_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB11_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -1682,11 +1809,17 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB11_3
-; GCN2-NEXT:    s_branch .LBB11_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB11_3
 ; GCN2-NEXT:  .LBB11_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB11_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB11_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB11_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -1700,7 +1833,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_and_b32_e32 v5, s9, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB11_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB11_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1716,21 +1849,28 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB11_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB11_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB11_3
-; GFX12-NEXT:    s_branch .LBB11_4
+; GFX12-NEXT:    s_branch .LBB11_3
 ; GFX12-NEXT:  .LBB11_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB11_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB11_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB11_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -1739,7 +1879,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_and_b32_e32 v3, s5, v1
 ; GFX12-NEXT:    v_and_b32_e32 v2, s4, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB11_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB11_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -1764,16 +1904,11 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB12_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB12_4
-; GCN1-NEXT:  .LBB12_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB12_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB12_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -1781,8 +1916,13 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB12_2
-; GCN1-NEXT:  .LBB12_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB12_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB12_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s4, -1
@@ -1797,6 +1937,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB12_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_and_i64:
@@ -1811,16 +1952,11 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB12_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB12_4
-; GCN2-NEXT:  .LBB12_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB12_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB12_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -1828,8 +1964,13 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB12_2
-; GCN2-NEXT:  .LBB12_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB12_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB12_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -1843,6 +1984,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB12_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_and_i64:
@@ -1851,24 +1993,25 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB12_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB12_4
-; GFX12-NEXT:  .LBB12_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB12_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB12_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX12-NEXT:  .LBB12_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB12_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB12_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -1876,6 +2019,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_and_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB12_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
@@ -1896,9 +2040,9 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB13_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB13_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -1907,11 +2051,17 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB13_3
-; GCN1-NEXT:    s_branch .LBB13_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB13_3
 ; GCN1-NEXT:  .LBB13_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB13_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB13_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB13_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s2, s4, -1
@@ -1926,7 +2076,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_and_b32_e32 v5, s1, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB13_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB13_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1945,9 +2095,9 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB13_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB13_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -1956,11 +2106,17 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB13_3
-; GCN2-NEXT:    s_branch .LBB13_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB13_3
 ; GCN2-NEXT:  .LBB13_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB13_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB13_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB13_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -1974,7 +2130,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_and_b32_e32 v5, s1, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB13_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB13_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1988,21 +2144,28 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB13_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB13_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB13_3
-; GFX12-NEXT:    s_branch .LBB13_4
+; GFX12-NEXT:    s_branch .LBB13_3
 ; GFX12-NEXT:  .LBB13_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB13_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB13_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB13_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -2010,7 +2173,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_and_b32_e32 v3, s1, v1
 ; GFX12-NEXT:    v_and_b32_e32 v2, s0, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB13_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB13_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2037,16 +2200,11 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB14_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB14_4
-; GCN1-NEXT:  .LBB14_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB14_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB14_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -2054,8 +2212,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB14_2
-; GCN1-NEXT:  .LBB14_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB14_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB14_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -2070,6 +2233,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB14_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_and_i64_addr64:
@@ -2088,16 +2252,11 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB14_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB14_4
-; GCN2-NEXT:  .LBB14_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB14_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB14_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -2105,8 +2264,13 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB14_2
-; GCN2-NEXT:  .LBB14_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB14_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB14_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -2120,6 +2284,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB14_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_and_i64_addr64:
@@ -2127,30 +2292,31 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB14_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB14_4
-; GFX12-NEXT:  .LBB14_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB14_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB14_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB14_2
-; GFX12-NEXT:  .LBB14_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB14_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB14_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -2159,6 +2325,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    v_and_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB14_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -2182,9 +2349,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB15_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -2193,11 +2360,17 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB15_3
-; GCN1-NEXT:    s_branch .LBB15_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB15_3
 ; GCN1-NEXT:  .LBB15_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB15_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB15_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB15_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -2212,7 +2385,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_and_b32_e32 v5, s9, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB15_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB15_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2233,9 +2406,9 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB15_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -2244,11 +2417,17 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB15_3
-; GCN2-NEXT:    s_branch .LBB15_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB15_3
 ; GCN2-NEXT:  .LBB15_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB15_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB15_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB15_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -2262,7 +2441,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_and_b32_e32 v5, s9, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB15_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB15_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2271,27 +2450,34 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_and_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB15_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB15_3
-; GFX12-NEXT:    s_branch .LBB15_4
+; GFX12-NEXT:    s_branch .LBB15_3
 ; GFX12-NEXT:  .LBB15_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB15_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB15_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB15_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -2300,7 +2486,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_and_b32_e32 v3, s5, v1
 ; GFX12-NEXT:    v_and_b32_e32 v2, s4, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB15_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB15_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2326,16 +2512,11 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB16_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB16_4
-; GCN1-NEXT:  .LBB16_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB16_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB16_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -2343,8 +2524,13 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB16_2
-; GCN1-NEXT:  .LBB16_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB16_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB16_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -2360,6 +2546,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB16_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_sub_i64_offset:
@@ -2376,16 +2563,11 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB16_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB16_4
-; GCN2-NEXT:  .LBB16_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB16_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB16_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -2393,8 +2575,13 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB16_2
-; GCN2-NEXT:  .LBB16_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB16_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB16_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -2409,6 +2596,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB16_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_sub_i64_offset:
@@ -2419,23 +2607,24 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB16_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB16_4
-; GFX12-NEXT:  .LBB16_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB16_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB16_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_2
-; GFX12-NEXT:  .LBB16_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB16_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB16_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -2444,6 +2633,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s2
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB16_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -2467,9 +2657,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB17_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB17_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -2478,11 +2668,17 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB17_3
-; GCN1-NEXT:    s_branch .LBB17_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB17_3
 ; GCN1-NEXT:  .LBB17_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB17_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB17_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB17_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s1
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -2498,7 +2694,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB17_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB17_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2519,9 +2715,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB17_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB17_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -2530,11 +2726,17 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB17_3
-; GCN2-NEXT:    s_branch .LBB17_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB17_3
 ; GCN2-NEXT:  .LBB17_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB17_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB17_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB17_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -2549,7 +2751,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB17_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB17_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2560,25 +2762,32 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB17_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB17_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB17_3
-; GFX12-NEXT:    s_branch .LBB17_4
+; GFX12-NEXT:    s_branch .LBB17_3
 ; GFX12-NEXT:  .LBB17_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB17_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB17_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB17_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -2587,7 +2796,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB17_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB17_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2617,16 +2826,11 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB18_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB18_4
-; GCN1-NEXT:  .LBB18_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB18_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB18_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -2634,8 +2838,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB18_2
-; GCN1-NEXT:  .LBB18_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB18_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB18_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -2651,36 +2860,33 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB18_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_sub_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB18_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB18_4
-; GCN2-NEXT:  .LBB18_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB18_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB18_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -2688,8 +2894,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB18_2
-; GCN2-NEXT:  .LBB18_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB18_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB18_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -2704,6 +2915,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB18_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_sub_i64_addr64_offset:
@@ -2718,24 +2930,25 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB18_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB18_4
-; GFX12-NEXT:  .LBB18_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB18_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB18_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB18_2
-; GFX12-NEXT:  .LBB18_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB18_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB18_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -2744,6 +2957,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s2
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB18_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -2770,9 +2984,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB19_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB19_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -2781,11 +2995,17 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB19_3
-; GCN1-NEXT:    s_branch .LBB19_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB19_3
 ; GCN1-NEXT:  .LBB19_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB19_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB19_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB19_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s9
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -2801,7 +3021,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB19_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB19_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2815,8 +3035,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -2824,9 +3044,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB19_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB19_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -2835,11 +3055,17 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB19_3
-; GCN2-NEXT:    s_branch .LBB19_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB19_3
 ; GCN2-NEXT:  .LBB19_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB19_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB19_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB19_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -2854,7 +3080,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB19_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB19_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -2870,21 +3096,28 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB19_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB19_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB19_3
-; GFX12-NEXT:    s_branch .LBB19_4
+; GFX12-NEXT:    s_branch .LBB19_3
 ; GFX12-NEXT:  .LBB19_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB19_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB19_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB19_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -2893,7 +3126,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s4
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB19_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB19_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -2918,16 +3151,11 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB20_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB20_4
-; GCN1-NEXT:  .LBB20_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB20_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB20_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -2935,8 +3163,13 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB20_2
-; GCN1-NEXT:  .LBB20_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB20_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB20_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
@@ -2952,6 +3185,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB20_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_sub_i64:
@@ -2966,16 +3200,11 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB20_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB20_4
-; GCN2-NEXT:  .LBB20_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB20_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB20_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -2983,8 +3212,13 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB20_2
-; GCN2-NEXT:  .LBB20_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB20_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB20_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -2999,6 +3233,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB20_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_sub_i64:
@@ -3007,24 +3242,25 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB20_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB20_4
-; GFX12-NEXT:  .LBB20_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB20_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB20_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB20_2
-; GFX12-NEXT:  .LBB20_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB20_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB20_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -3032,6 +3268,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s2
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB20_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
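
For context on the check-line churn in these tests: with the new uniform lowering, a uniform i1 zero-extended to i32 is selected as s_cselect_b32 sN, 1, 0 followed by s_cmp_lg_u32 sN, 1 / s_cbranch_scc*, instead of materializing a -1/0 lane mask via s_cselect_b64 and testing it with s_andn2_b64 vcc, exec, ... / s_cbranch_vcc*. A minimal IR sketch (a hypothetical reduction, not taken from this patch) that exercises the same uniform pattern:

define amdgpu_kernel void @uniform_zext_branch(ptr %out, i32 %a, i32 %b) {
entry:
  ; Both operands are SGPR kernel arguments, so the compare is uniform.
  %cmp = icmp eq i32 %a, %b
  ; With this patch the uniform zext should select to s_cselect_b32 sN, 1, 0.
  %ext = zext i1 %cmp to i32
  %cond = icmp eq i32 %ext, 0
  br i1 %cond, label %then, label %end
then:
  store i32 %ext, ptr %out
  br label %end
end:
  ret void
}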
@@ -3052,9 +3289,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB21_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB21_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -3063,11 +3300,17 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB21_3
-; GCN1-NEXT:    s_branch .LBB21_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB21_3
 ; GCN1-NEXT:  .LBB21_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB21_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB21_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB21_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s1
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -3083,7 +3326,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB21_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB21_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -3102,9 +3345,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB21_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB21_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -3113,11 +3356,17 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB21_3
-; GCN2-NEXT:    s_branch .LBB21_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB21_3
 ; GCN2-NEXT:  .LBB21_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB21_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB21_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB21_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -3132,7 +3381,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB21_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB21_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -3146,21 +3395,28 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB21_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB21_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB21_3
-; GFX12-NEXT:    s_branch .LBB21_4
+; GFX12-NEXT:    s_branch .LBB21_3
 ; GFX12-NEXT:  .LBB21_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB21_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB21_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB21_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -3168,7 +3424,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB21_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB21_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3195,16 +3451,11 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB22_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB22_4
-; GCN1-NEXT:  .LBB22_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB22_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB22_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -3212,8 +3463,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB22_2
-; GCN1-NEXT:  .LBB22_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB22_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB22_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -3229,6 +3485,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB22_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_sub_i64_addr64:
@@ -3247,16 +3504,11 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB22_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB22_4
-; GCN2-NEXT:  .LBB22_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB22_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB22_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -3264,8 +3516,13 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB22_2
-; GCN2-NEXT:  .LBB22_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB22_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB22_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -3280,6 +3537,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB22_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_sub_i64_addr64:
@@ -3287,30 +3545,31 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB22_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB22_4
-; GFX12-NEXT:  .LBB22_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB22_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB22_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB22_2
-; GFX12-NEXT:  .LBB22_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB22_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB22_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -3319,6 +3578,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s2
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB22_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -3342,9 +3602,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB23_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB23_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -3353,11 +3613,17 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB23_3
-; GCN1-NEXT:    s_branch .LBB23_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB23_3
 ; GCN1-NEXT:  .LBB23_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB23_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB23_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB23_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s9
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -3373,7 +3639,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB23_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB23_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -3394,9 +3660,9 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB23_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB23_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -3405,11 +3671,17 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB23_3
-; GCN2-NEXT:    s_branch .LBB23_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB23_3
 ; GCN2-NEXT:  .LBB23_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB23_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB23_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB23_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -3424,7 +3696,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB23_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB23_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -3433,27 +3705,34 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_sub_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB23_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB23_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB23_3
-; GFX12-NEXT:    s_branch .LBB23_4
+; GFX12-NEXT:    s_branch .LBB23_3
 ; GFX12-NEXT:  .LBB23_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB23_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB23_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB23_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -3462,7 +3741,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s4
 ; GFX12-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s5, v1, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB23_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB23_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3488,24 +3767,24 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB24_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB24_4
-; GCN1-NEXT:  .LBB24_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB24_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB24_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB24_2
-; GCN1-NEXT:  .LBB24_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB24_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB24_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -3522,6 +3801,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB24_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_max_i64_offset:
@@ -3538,24 +3818,24 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB24_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB24_4
-; GCN2-NEXT:  .LBB24_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB24_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB24_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB24_2
-; GCN2-NEXT:  .LBB24_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB24_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB24_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -3571,6 +3851,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB24_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64_offset:
@@ -3581,23 +3862,24 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB24_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB24_4
-; GFX12-NEXT:  .LBB24_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB24_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB24_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB24_2
-; GFX12-NEXT:  .LBB24_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB24_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB24_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -3607,6 +3889,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB24_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -3630,9 +3913,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB25_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB25_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -3640,11 +3923,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB25_3
-; GCN1-NEXT:    s_branch .LBB25_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB25_3
 ; GCN1-NEXT:  .LBB25_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB25_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB25_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB25_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -3661,7 +3950,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB25_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB25_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -3683,9 +3972,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB25_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB25_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -3693,11 +3982,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB25_3
-; GCN2-NEXT:    s_branch .LBB25_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB25_3
 ; GCN2-NEXT:  .LBB25_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB25_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB25_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB25_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -3713,7 +4008,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB25_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB25_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -3725,25 +4020,32 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB25_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB25_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB25_3
-; GFX12-NEXT:    s_branch .LBB25_4
+; GFX12-NEXT:    s_branch .LBB25_3
 ; GFX12-NEXT:  .LBB25_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB25_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB25_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB25_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -3753,7 +4055,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s1, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s0, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB25_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB25_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -3783,24 +4085,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB26_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB26_4
-; GCN1-NEXT:  .LBB26_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB26_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB26_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB26_2
-; GCN1-NEXT:  .LBB26_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB26_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB26_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -3817,44 +4119,46 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB26_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_max_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB26_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB26_4
-; GCN2-NEXT:  .LBB26_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB26_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB26_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB26_2
-; GCN2-NEXT:  .LBB26_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB26_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB26_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -3870,6 +4174,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB26_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64_addr64_offset:
@@ -3884,24 +4189,25 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB26_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB26_4
-; GFX12-NEXT:  .LBB26_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB26_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB26_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB26_2
-; GFX12-NEXT:  .LBB26_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB26_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB26_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -3911,6 +4217,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB26_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -3937,9 +4244,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB27_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB27_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -3947,11 +4254,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN1-NEXT:    flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB27_3
-; GCN1-NEXT:    s_branch .LBB27_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB27_3
 ; GCN1-NEXT:  .LBB27_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB27_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB27_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB27_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -3968,7 +4281,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB27_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB27_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -3983,8 +4296,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -3992,9 +4305,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB27_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB27_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -4002,11 +4315,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB27_3
-; GCN2-NEXT:    s_branch .LBB27_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB27_3
 ; GCN2-NEXT:  .LBB27_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB27_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB27_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB27_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -4022,7 +4341,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB27_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB27_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -4039,21 +4358,28 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB27_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB27_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB27_3
-; GFX12-NEXT:    s_branch .LBB27_4
+; GFX12-NEXT:    s_branch .LBB27_3
 ; GFX12-NEXT:  .LBB27_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB27_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB27_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB27_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -4063,7 +4389,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s5, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s4, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB27_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB27_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4088,24 +4414,24 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB28_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB28_4
-; GCN1-NEXT:  .LBB28_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB28_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB28_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB28_2
-; GCN1-NEXT:  .LBB28_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB28_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB28_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
@@ -4122,6 +4448,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB28_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_max_i64:
@@ -4136,24 +4463,24 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB28_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB28_4
-; GCN2-NEXT:  .LBB28_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB28_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB28_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB28_2
-; GCN2-NEXT:  .LBB28_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB28_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB28_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -4169,6 +4496,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB28_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64:
@@ -4177,24 +4505,25 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB28_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB28_4
-; GFX12-NEXT:  .LBB28_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB28_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB28_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB28_2
-; GFX12-NEXT:  .LBB28_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB28_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB28_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -4203,6 +4532,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB28_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
@@ -4223,9 +4553,9 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB29_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB29_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -4233,11 +4563,17 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB29_3
-; GCN1-NEXT:    s_branch .LBB29_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB29_3
 ; GCN1-NEXT:  .LBB29_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB29_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB29_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB29_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -4254,7 +4590,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB29_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB29_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -4274,9 +4610,9 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB29_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB29_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -4284,11 +4620,17 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB29_3
-; GCN2-NEXT:    s_branch .LBB29_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB29_3
 ; GCN2-NEXT:  .LBB29_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB29_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB29_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB29_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -4304,7 +4646,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB29_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB29_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -4319,21 +4661,28 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB29_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB29_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB29_3
-; GFX12-NEXT:    s_branch .LBB29_4
+; GFX12-NEXT:    s_branch .LBB29_3
 ; GFX12-NEXT:  .LBB29_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB29_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB29_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB29_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -4342,7 +4691,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s1, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s0, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB29_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB29_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4369,24 +4718,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB30_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB30_4
-; GCN1-NEXT:  .LBB30_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB30_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB30_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB30_2
-; GCN1-NEXT:  .LBB30_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB30_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB30_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -4403,6 +4752,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB30_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_max_i64_addr64:
@@ -4421,24 +4771,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB30_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB30_4
-; GCN2-NEXT:  .LBB30_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB30_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB30_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB30_2
-; GCN2-NEXT:  .LBB30_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB30_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB30_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -4454,6 +4804,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB30_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_max_i64_addr64:
@@ -4461,30 +4812,31 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB30_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB30_4
-; GFX12-NEXT:  .LBB30_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB30_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB30_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB30_2
-; GFX12-NEXT:  .LBB30_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB30_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB30_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -4494,6 +4846,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB30_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -4517,9 +4870,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB31_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB31_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -4527,11 +4880,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN1-NEXT:    flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB31_3
-; GCN1-NEXT:    s_branch .LBB31_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB31_3
 ; GCN1-NEXT:  .LBB31_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB31_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB31_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB31_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -4548,7 +4907,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB31_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB31_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -4570,9 +4929,9 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB31_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB31_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -4580,11 +4939,17 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN2-NEXT:    flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB31_3
-; GCN2-NEXT:    s_branch .LBB31_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB31_3
 ; GCN2-NEXT:  .LBB31_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB31_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB31_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB31_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -4600,7 +4965,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB31_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB31_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -4610,27 +4975,34 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_max_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB31_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB31_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB31_3
-; GFX12-NEXT:    s_branch .LBB31_4
+; GFX12-NEXT:    s_branch .LBB31_3
 ; GFX12-NEXT:  .LBB31_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB31_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB31_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB31_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -4640,7 +5012,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s5, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s4, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB31_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB31_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4666,24 +5038,24 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB32_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB32_4
-; GCN1-NEXT:  .LBB32_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB32_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB32_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB32_2
-; GCN1-NEXT:  .LBB32_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB32_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB32_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -4700,6 +5072,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB32_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_umax_i64_offset:
@@ -4716,24 +5089,24 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB32_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB32_4
-; GCN2-NEXT:  .LBB32_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB32_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB32_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB32_2
-; GCN2-NEXT:  .LBB32_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB32_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB32_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -4749,6 +5122,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB32_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64_offset:
@@ -4759,23 +5133,24 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB32_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB32_4
-; GFX12-NEXT:  .LBB32_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB32_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB32_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB32_2
-; GFX12-NEXT:  .LBB32_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB32_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB32_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -4785,6 +5160,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB32_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -4808,9 +5184,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB33_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB33_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -4818,11 +5194,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB33_3
-; GCN1-NEXT:    s_branch .LBB33_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB33_3
 ; GCN1-NEXT:  .LBB33_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB33_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB33_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB33_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -4839,7 +5221,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB33_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB33_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -4861,9 +5243,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB33_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB33_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -4871,11 +5253,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB33_3
-; GCN2-NEXT:    s_branch .LBB33_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB33_3
 ; GCN2-NEXT:  .LBB33_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB33_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB33_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB33_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -4891,7 +5279,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB33_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB33_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -4903,25 +5291,32 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB33_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB33_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB33_3
-; GFX12-NEXT:    s_branch .LBB33_4
+; GFX12-NEXT:    s_branch .LBB33_3
 ; GFX12-NEXT:  .LBB33_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB33_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB33_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB33_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -4931,7 +5326,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s1, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s0, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB33_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB33_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -4961,24 +5356,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB34_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB34_4
-; GCN1-NEXT:  .LBB34_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB34_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB34_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB34_2
-; GCN1-NEXT:  .LBB34_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB34_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB34_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -4995,44 +5390,46 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB34_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_umax_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB34_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB34_4
-; GCN2-NEXT:  .LBB34_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB34_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB34_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB34_2
-; GCN2-NEXT:  .LBB34_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB34_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB34_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -5048,6 +5445,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB34_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64_addr64_offset:
@@ -5062,24 +5460,25 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB34_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB34_4
-; GFX12-NEXT:  .LBB34_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB34_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB34_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB34_2
-; GFX12-NEXT:  .LBB34_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB34_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB34_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -5089,6 +5488,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB34_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -5115,9 +5515,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB35_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB35_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -5125,11 +5525,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN1-NEXT:    flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB35_3
-; GCN1-NEXT:    s_branch .LBB35_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB35_3
 ; GCN1-NEXT:  .LBB35_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB35_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB35_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB35_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -5146,7 +5552,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB35_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB35_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -5161,8 +5567,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -5170,9 +5576,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB35_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB35_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -5180,11 +5586,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB35_3
-; GCN2-NEXT:    s_branch .LBB35_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB35_3
 ; GCN2-NEXT:  .LBB35_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB35_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB35_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB35_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -5200,7 +5612,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB35_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB35_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -5217,21 +5629,28 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB35_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB35_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB35_3
-; GFX12-NEXT:    s_branch .LBB35_4
+; GFX12-NEXT:    s_branch .LBB35_3
 ; GFX12-NEXT:  .LBB35_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB35_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB35_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB35_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -5241,7 +5660,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s5, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s4, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB35_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB35_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5266,24 +5685,24 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB36_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB36_4
-; GCN1-NEXT:  .LBB36_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB36_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB36_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB36_2
-; GCN1-NEXT:  .LBB36_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB36_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB36_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
@@ -5300,6 +5719,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB36_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_umax_i64:
@@ -5314,24 +5734,24 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB36_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB36_4
-; GCN2-NEXT:  .LBB36_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB36_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB36_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB36_2
-; GCN2-NEXT:  .LBB36_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB36_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB36_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -5347,6 +5767,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB36_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64:
@@ -5355,24 +5776,25 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB36_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB36_4
-; GFX12-NEXT:  .LBB36_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB36_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB36_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB36_2
-; GFX12-NEXT:  .LBB36_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB36_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB36_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -5381,6 +5803,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB36_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
@@ -5401,9 +5824,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB37_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB37_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -5411,11 +5834,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB37_3
-; GCN1-NEXT:    s_branch .LBB37_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB37_3
 ; GCN1-NEXT:  .LBB37_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB37_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB37_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB37_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -5432,7 +5861,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB37_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB37_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -5452,9 +5881,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB37_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB37_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -5462,11 +5891,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB37_3
-; GCN2-NEXT:    s_branch .LBB37_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB37_3
 ; GCN2-NEXT:  .LBB37_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB37_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB37_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB37_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -5482,7 +5917,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB37_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB37_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -5497,21 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB37_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB37_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB37_3
-; GFX12-NEXT:    s_branch .LBB37_4
+; GFX12-NEXT:    s_branch .LBB37_3
 ; GFX12-NEXT:  .LBB37_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB37_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB37_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB37_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -5520,7 +5962,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s1, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s0, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB37_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB37_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5547,24 +5989,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB38_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB38_4
-; GCN1-NEXT:  .LBB38_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB38_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB38_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB38_2
-; GCN1-NEXT:  .LBB38_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB38_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB38_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -5581,6 +6023,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB38_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_umax_i64_addr64:
@@ -5599,24 +6042,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB38_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB38_4
-; GCN2-NEXT:  .LBB38_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB38_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB38_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB38_2
-; GCN2-NEXT:  .LBB38_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB38_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB38_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -5632,6 +6075,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB38_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umax_i64_addr64:
@@ -5639,30 +6083,31 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB38_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB38_4
-; GFX12-NEXT:  .LBB38_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB38_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB38_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB38_2
-; GFX12-NEXT:  .LBB38_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB38_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB38_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -5672,6 +6117,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB38_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -5695,9 +6141,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB39_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB39_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -5705,11 +6151,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN1-NEXT:    flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB39_3
-; GCN1-NEXT:    s_branch .LBB39_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB39_3
 ; GCN1-NEXT:  .LBB39_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB39_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB39_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB39_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -5726,7 +6178,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB39_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB39_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -5748,9 +6200,9 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB39_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB39_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -5758,11 +6210,17 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN2-NEXT:    flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB39_3
-; GCN2-NEXT:    s_branch .LBB39_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB39_3
 ; GCN2-NEXT:  .LBB39_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB39_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB39_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB39_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -5778,7 +6236,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB39_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB39_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -5788,27 +6246,34 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-LABEL: atomic_umax_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB39_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB39_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB39_3
-; GFX12-NEXT:    s_branch .LBB39_4
+; GFX12-NEXT:    s_branch .LBB39_3
 ; GFX12-NEXT:  .LBB39_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB39_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB39_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB39_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -5818,7 +6283,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s5, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s4, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB39_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB39_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -5844,24 +6309,24 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB40_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB40_4
-; GCN1-NEXT:  .LBB40_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB40_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB40_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB40_2
-; GCN1-NEXT:  .LBB40_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB40_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB40_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -5878,6 +6343,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB40_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_min_i64_offset:
@@ -5894,24 +6360,24 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB40_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB40_4
-; GCN2-NEXT:  .LBB40_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB40_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB40_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB40_2
-; GCN2-NEXT:  .LBB40_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB40_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB40_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -5927,6 +6393,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB40_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64_offset:
@@ -5937,23 +6404,24 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB40_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB40_4
-; GFX12-NEXT:  .LBB40_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB40_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB40_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB40_2
-; GFX12-NEXT:  .LBB40_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB40_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB40_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -5963,6 +6431,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB40_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -5986,9 +6455,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB41_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB41_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -5996,11 +6465,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB41_3
-; GCN1-NEXT:    s_branch .LBB41_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB41_3
 ; GCN1-NEXT:  .LBB41_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB41_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB41_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB41_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -6017,7 +6492,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB41_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB41_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -6039,9 +6514,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB41_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB41_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -6049,11 +6524,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB41_3
-; GCN2-NEXT:    s_branch .LBB41_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB41_3
 ; GCN2-NEXT:  .LBB41_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB41_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB41_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB41_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -6069,7 +6550,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB41_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB41_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -6081,25 +6562,32 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB41_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB41_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB41_3
-; GFX12-NEXT:    s_branch .LBB41_4
+; GFX12-NEXT:    s_branch .LBB41_3
 ; GFX12-NEXT:  .LBB41_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB41_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB41_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB41_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -6109,7 +6597,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s1, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s0, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB41_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB41_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6139,24 +6627,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB42_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB42_4
-; GCN1-NEXT:  .LBB42_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB42_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB42_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB42_2
-; GCN1-NEXT:  .LBB42_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB42_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB42_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -6173,44 +6661,46 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB42_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_min_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB42_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB42_4
-; GCN2-NEXT:  .LBB42_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB42_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB42_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB42_2
-; GCN2-NEXT:  .LBB42_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB42_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB42_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -6226,6 +6716,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB42_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64_addr64_offset:
@@ -6240,24 +6731,25 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB42_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB42_4
-; GFX12-NEXT:  .LBB42_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB42_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB42_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB42_2
-; GFX12-NEXT:  .LBB42_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB42_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB42_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -6267,6 +6759,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB42_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -6293,9 +6786,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB43_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB43_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -6303,11 +6796,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN1-NEXT:    flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB43_3
-; GCN1-NEXT:    s_branch .LBB43_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB43_3
 ; GCN1-NEXT:  .LBB43_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB43_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB43_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB43_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -6324,7 +6823,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB43_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB43_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -6339,8 +6838,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -6348,9 +6847,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB43_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB43_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -6358,11 +6857,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB43_3
-; GCN2-NEXT:    s_branch .LBB43_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB43_3
 ; GCN2-NEXT:  .LBB43_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB43_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB43_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB43_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -6378,7 +6883,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB43_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB43_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -6395,21 +6900,28 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB43_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB43_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB43_3
-; GFX12-NEXT:    s_branch .LBB43_4
+; GFX12-NEXT:    s_branch .LBB43_3
 ; GFX12-NEXT:  .LBB43_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB43_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB43_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB43_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -6419,7 +6931,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s5, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s4, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB43_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB43_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6444,24 +6956,24 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB44_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB44_4
-; GCN1-NEXT:  .LBB44_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB44_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB44_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB44_2
-; GCN1-NEXT:  .LBB44_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB44_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB44_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
@@ -6478,6 +6990,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB44_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_min_i64:
@@ -6492,24 +7005,24 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB44_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB44_4
-; GCN2-NEXT:  .LBB44_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB44_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB44_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB44_2
-; GCN2-NEXT:  .LBB44_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB44_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB44_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -6525,6 +7038,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB44_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64:
@@ -6533,24 +7047,25 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB44_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB44_4
-; GFX12-NEXT:  .LBB44_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB44_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB44_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB44_2
-; GFX12-NEXT:  .LBB44_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB44_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB44_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -6559,6 +7074,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB44_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
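
(The block layout changes alongside the condition lowering: %atomicrmw.phi moves to the end of the function as a fall-through block, and the %Flow rejoin re-derives the branch condition from the 0/1 flag each predecessor writes (0 from %atomicrmw.global, -1 otherwise). A summary of the shape in the GCN1/GCN2 hunks above, not new test content:

    s_and_b64      s[0:1], s[0:1], exec
    s_cselect_b32  s0, 1, 0
    s_cmp_lg_u32   s0, 1
    s_cbranch_scc1 .LBB44_4  ; flag was 0 (global path): skip
                             ; atomicrmw.private, land on atomicrmw.phi
)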
@@ -6579,9 +7095,9 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB45_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB45_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -6589,11 +7105,17 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB45_3
-; GCN1-NEXT:    s_branch .LBB45_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB45_3
 ; GCN1-NEXT:  .LBB45_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB45_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB45_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB45_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -6610,7 +7132,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB45_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB45_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -6630,9 +7152,9 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB45_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB45_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -6640,11 +7162,17 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB45_3
-; GCN2-NEXT:    s_branch .LBB45_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB45_3
 ; GCN2-NEXT:  .LBB45_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB45_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB45_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB45_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -6660,7 +7188,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB45_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB45_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -6675,21 +7203,28 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB45_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB45_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB45_3
-; GFX12-NEXT:    s_branch .LBB45_4
+; GFX12-NEXT:    s_branch .LBB45_3
 ; GFX12-NEXT:  .LBB45_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB45_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB45_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB45_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -6698,7 +7233,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s1, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s0, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB45_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB45_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -6725,24 +7260,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB46_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB46_4
-; GCN1-NEXT:  .LBB46_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB46_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB46_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB46_2
-; GCN1-NEXT:  .LBB46_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB46_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB46_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -6759,6 +7294,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB46_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_min_i64_addr64:
@@ -6777,24 +7313,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB46_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB46_4
-; GCN2-NEXT:  .LBB46_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB46_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB46_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB46_2
-; GCN2-NEXT:  .LBB46_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB46_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB46_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -6810,6 +7346,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB46_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_min_i64_addr64:
@@ -6817,30 +7354,31 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB46_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB46_4
-; GFX12-NEXT:  .LBB46_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB46_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB46_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB46_2
-; GFX12-NEXT:  .LBB46_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB46_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB46_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -6850,6 +7388,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB46_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -6873,9 +7412,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB47_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB47_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -6883,11 +7422,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN1-NEXT:    flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB47_3
-; GCN1-NEXT:    s_branch .LBB47_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB47_3
 ; GCN1-NEXT:  .LBB47_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB47_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB47_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB47_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -6904,7 +7449,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB47_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB47_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -6926,9 +7471,9 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB47_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB47_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -6936,11 +7481,17 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN2-NEXT:    flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB47_3
-; GCN2-NEXT:    s_branch .LBB47_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB47_3
 ; GCN2-NEXT:  .LBB47_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB47_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB47_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB47_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -6956,7 +7507,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB47_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB47_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -6966,27 +7517,34 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_min_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB47_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB47_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB47_3
-; GFX12-NEXT:    s_branch .LBB47_4
+; GFX12-NEXT:    s_branch .LBB47_3
 ; GFX12-NEXT:  .LBB47_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB47_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB47_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB47_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -6996,7 +7554,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s5, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s4, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB47_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB47_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -7022,24 +7580,24 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB48_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB48_4
-; GCN1-NEXT:  .LBB48_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB48_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB48_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB48_2
-; GCN1-NEXT:  .LBB48_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB48_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB48_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -7056,6 +7614,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB48_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_umin_i64_offset:
@@ -7072,24 +7631,24 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB48_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB48_4
-; GCN2-NEXT:  .LBB48_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB48_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB48_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB48_2
-; GCN2-NEXT:  .LBB48_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB48_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB48_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -7105,6 +7664,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB48_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64_offset:
@@ -7115,23 +7675,24 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB48_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB48_4
-; GFX12-NEXT:  .LBB48_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB48_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB48_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB48_2
-; GFX12-NEXT:  .LBB48_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB48_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB48_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -7141,6 +7702,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB48_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -7164,9 +7726,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB49_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB49_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -7174,11 +7736,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB49_3
-; GCN1-NEXT:    s_branch .LBB49_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB49_3
 ; GCN1-NEXT:  .LBB49_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB49_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB49_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB49_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -7195,7 +7763,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB49_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB49_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -7217,9 +7785,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB49_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB49_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -7227,11 +7795,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB49_3
-; GCN2-NEXT:    s_branch .LBB49_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB49_3
 ; GCN2-NEXT:  .LBB49_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB49_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB49_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB49_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -7247,7 +7821,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB49_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB49_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -7259,25 +7833,32 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB49_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB49_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB49_3
-; GFX12-NEXT:    s_branch .LBB49_4
+; GFX12-NEXT:    s_branch .LBB49_3
 ; GFX12-NEXT:  .LBB49_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB49_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB49_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB49_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -7287,7 +7868,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s1, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s0, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB49_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB49_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -7317,24 +7898,24 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB50_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB50_4
-; GCN1-NEXT:  .LBB50_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB50_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB50_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB50_2
-; GCN1-NEXT:  .LBB50_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB50_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB50_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -7351,44 +7932,46 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB50_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_umin_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB50_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB50_4
-; GCN2-NEXT:  .LBB50_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB50_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB50_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB50_2
-; GCN2-NEXT:  .LBB50_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB50_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB50_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -7404,6 +7987,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB50_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64_addr64_offset:
@@ -7418,24 +8002,25 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB50_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB50_4
-; GFX12-NEXT:  .LBB50_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB50_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB50_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB50_2
-; GFX12-NEXT:  .LBB50_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB50_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB50_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -7445,6 +8030,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB50_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -7471,9 +8057,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB51_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB51_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -7481,11 +8067,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB51_3
-; GCN1-NEXT:    s_branch .LBB51_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB51_3
 ; GCN1-NEXT:  .LBB51_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB51_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB51_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB51_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -7502,7 +8094,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB51_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB51_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -7517,8 +8109,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -7526,9 +8118,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB51_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB51_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -7536,11 +8128,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB51_3
-; GCN2-NEXT:    s_branch .LBB51_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB51_3
 ; GCN2-NEXT:  .LBB51_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB51_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB51_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB51_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -7556,7 +8154,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB51_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB51_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -7573,21 +8171,28 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB51_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB51_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB51_3
-; GFX12-NEXT:    s_branch .LBB51_4
+; GFX12-NEXT:    s_branch .LBB51_3
 ; GFX12-NEXT:  .LBB51_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB51_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB51_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB51_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -7597,7 +8202,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s5, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s4, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB51_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB51_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -7622,24 +8227,24 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB52_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB52_4
-; GCN1-NEXT:  .LBB52_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB52_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB52_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB52_2
-; GCN1-NEXT:  .LBB52_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB52_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB52_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
@@ -7656,6 +8261,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB52_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_umin_i64:
@@ -7670,24 +8276,24 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB52_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB52_4
-; GCN2-NEXT:  .LBB52_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB52_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB52_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB52_2
-; GCN2-NEXT:  .LBB52_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB52_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB52_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -7703,6 +8309,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB52_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64:
@@ -7711,24 +8318,25 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB52_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB52_4
-; GFX12-NEXT:  .LBB52_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB52_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB52_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB52_2
-; GFX12-NEXT:  .LBB52_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB52_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB52_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -7737,6 +8345,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB52_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
@@ -7757,9 +8366,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB53_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB53_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -7767,11 +8376,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB53_3
-; GCN1-NEXT:    s_branch .LBB53_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB53_3
 ; GCN1-NEXT:  .LBB53_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB53_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB53_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB53_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -7788,7 +8403,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB53_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB53_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -7808,9 +8423,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB53_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB53_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -7818,11 +8433,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB53_3
-; GCN2-NEXT:    s_branch .LBB53_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB53_3
 ; GCN2-NEXT:  .LBB53_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB53_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB53_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB53_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -7838,7 +8459,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB53_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB53_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -7853,21 +8474,28 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB53_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB53_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB53_3
-; GFX12-NEXT:    s_branch .LBB53_4
+; GFX12-NEXT:    s_branch .LBB53_3
 ; GFX12-NEXT:  .LBB53_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB53_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB53_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB53_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -7876,7 +8504,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s1, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s0, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB53_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB53_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -7903,24 +8531,24 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB54_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB54_4
-; GCN1-NEXT:  .LBB54_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB54_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB54_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execnz .LBB54_2
-; GCN1-NEXT:  .LBB54_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB54_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB54_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -7937,6 +8565,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB54_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_umin_i64_addr64:
@@ -7955,24 +8584,24 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB54_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB54_4
-; GCN2-NEXT:  .LBB54_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB54_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB54_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execnz .LBB54_2
-; GCN2-NEXT:  .LBB54_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB54_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB54_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -7988,6 +8617,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB54_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_umin_i64_addr64:
@@ -7995,30 +8625,31 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB54_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB54_4
-; GFX12-NEXT:  .LBB54_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB54_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB54_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execnz .LBB54_2
-; GFX12-NEXT:  .LBB54_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB54_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB54_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -8028,6 +8659,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB54_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -8051,9 +8683,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB55_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB55_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -8061,11 +8693,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN1-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN1-NEXT:    s_cbranch_execz .LBB55_3
-; GCN1-NEXT:    s_branch .LBB55_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB55_3
 ; GCN1-NEXT:  .LBB55_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB55_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB55_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB55_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -8082,7 +8720,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB55_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB55_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
@@ -8104,9 +8742,9 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB55_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB55_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -8114,11 +8752,17 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN2-NEXT:    flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN2-NEXT:    s_cbranch_execz .LBB55_3
-; GCN2-NEXT:    s_branch .LBB55_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB55_3
 ; GCN2-NEXT:  .LBB55_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB55_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB55_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB55_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -8134,7 +8778,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB55_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB55_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
@@ -8144,27 +8788,34 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-LABEL: atomic_umin_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB55_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB55_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_SE
-; GFX12-NEXT:    s_cbranch_execz .LBB55_3
-; GFX12-NEXT:    s_branch .LBB55_4
+; GFX12-NEXT:    s_branch .LBB55_3
 ; GFX12-NEXT:  .LBB55_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB55_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB55_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB55_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -8174,7 +8825,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    v_cndmask_b32_e32 v3, s5, v1, vcc_lo
 ; GFX12-NEXT:    v_cndmask_b32_e32 v2, s4, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB55_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB55_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -8200,16 +8851,11 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB56_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB56_4
-; GCN1-NEXT:  .LBB56_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB56_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB56_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -8217,8 +8863,13 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB56_2
-; GCN1-NEXT:  .LBB56_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB56_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB56_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -8233,6 +8884,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB56_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_or_i64_offset:
@@ -8249,16 +8901,11 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB56_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB56_4
-; GCN2-NEXT:  .LBB56_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB56_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB56_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -8266,8 +8913,13 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB56_2
-; GCN2-NEXT:  .LBB56_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB56_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB56_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -8281,6 +8933,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB56_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_or_i64_offset:
@@ -8291,23 +8944,24 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB56_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB56_4
-; GFX12-NEXT:  .LBB56_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB56_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB56_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB56_2
-; GFX12-NEXT:  .LBB56_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB56_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB56_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -8316,6 +8970,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_or_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB56_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -8339,9 +8994,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB57_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB57_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -8350,11 +9005,17 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB57_3
-; GCN1-NEXT:    s_branch .LBB57_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB57_3
 ; GCN1-NEXT:  .LBB57_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB57_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB57_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB57_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN1-NEXT:    s_cselect_b32 s2, s2, -1
@@ -8369,7 +9030,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GCN1-NEXT:    v_or_b32_e32 v5, s1, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB57_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB57_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -8390,9 +9051,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB57_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB57_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -8401,11 +9062,17 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB57_3
-; GCN2-NEXT:    s_branch .LBB57_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB57_3
 ; GCN2-NEXT:  .LBB57_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB57_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB57_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB57_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -8419,7 +9086,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    v_or_b32_e32 v5, s1, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB57_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB57_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -8430,25 +9097,32 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB57_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB57_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB57_3
-; GFX12-NEXT:    s_branch .LBB57_4
+; GFX12-NEXT:    s_branch .LBB57_3
 ; GFX12-NEXT:  .LBB57_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB57_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB57_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB57_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -8457,7 +9131,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
 ; GFX12-NEXT:    v_or_b32_e32 v3, s1, v1
 ; GFX12-NEXT:    v_or_b32_e32 v2, s0, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB57_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB57_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -8487,16 +9161,11 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB58_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB58_4
-; GCN1-NEXT:  .LBB58_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB58_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB58_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -8504,8 +9173,13 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB58_2
-; GCN1-NEXT:  .LBB58_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB58_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB58_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -8520,36 +9194,33 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GCN1-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB58_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_or_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB58_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB58_4
-; GCN2-NEXT:  .LBB58_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB58_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB58_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -8557,8 +9228,13 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB58_2
-; GCN2-NEXT:  .LBB58_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB58_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB58_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -8572,6 +9248,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GCN2-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB58_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_or_i64_addr64_offset:
@@ -8586,24 +9263,25 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB58_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB58_4
-; GFX12-NEXT:  .LBB58_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB58_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB58_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB58_2
-; GFX12-NEXT:  .LBB58_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB58_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB58_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -8612,6 +9290,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
 ; GFX12-NEXT:    v_or_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB58_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -8638,9 +9317,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB59_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB59_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -8649,11 +9328,17 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB59_3
-; GCN1-NEXT:    s_branch .LBB59_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB59_3
 ; GCN1-NEXT:  .LBB59_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB59_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB59_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB59_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -8668,7 +9353,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_or_b32_e32 v5, s9, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB59_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB59_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -8682,8 +9367,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -8691,9 +9376,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB59_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB59_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -8702,11 +9387,17 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB59_3
-; GCN2-NEXT:    s_branch .LBB59_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB59_3
 ; GCN2-NEXT:  .LBB59_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB59_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB59_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB59_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -8720,7 +9411,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_or_b32_e32 v5, s9, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB59_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB59_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -8736,21 +9427,28 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB59_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB59_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB59_3
-; GFX12-NEXT:    s_branch .LBB59_4
+; GFX12-NEXT:    s_branch .LBB59_3
 ; GFX12-NEXT:  .LBB59_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB59_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB59_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB59_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -8759,7 +9457,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_or_b32_e32 v3, s5, v1
 ; GFX12-NEXT:    v_or_b32_e32 v2, s4, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB59_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB59_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -8784,16 +9482,11 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB60_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB60_4
-; GCN1-NEXT:  .LBB60_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB60_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB60_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -8801,8 +9494,13 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB60_2
-; GCN1-NEXT:  .LBB60_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB60_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB60_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s4, -1
@@ -8817,6 +9515,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB60_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_or_i64:
@@ -8831,16 +9530,11 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB60_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB60_4
-; GCN2-NEXT:  .LBB60_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB60_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB60_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -8848,8 +9542,13 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB60_2
-; GCN2-NEXT:  .LBB60_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB60_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB60_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -8863,6 +9562,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB60_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_or_i64:
@@ -8871,24 +9571,25 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB60_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB60_4
-; GFX12-NEXT:  .LBB60_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB60_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB60_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB60_2
-; GFX12-NEXT:  .LBB60_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB60_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB60_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -8896,6 +9597,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_or_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB60_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
@@ -8916,9 +9618,9 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB61_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB61_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -8927,11 +9629,17 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB61_3
-; GCN1-NEXT:    s_branch .LBB61_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB61_3
 ; GCN1-NEXT:  .LBB61_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB61_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB61_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB61_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s2, s4, -1
@@ -8946,7 +9654,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_or_b32_e32 v5, s1, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB61_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB61_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -8965,9 +9673,9 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB61_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB61_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -8976,11 +9684,17 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB61_3
-; GCN2-NEXT:    s_branch .LBB61_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB61_3
 ; GCN2-NEXT:  .LBB61_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB61_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB61_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB61_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -8994,7 +9708,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_or_b32_e32 v5, s1, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB61_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB61_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -9008,21 +9722,28 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB61_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB61_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB61_3
-; GFX12-NEXT:    s_branch .LBB61_4
+; GFX12-NEXT:    s_branch .LBB61_3
 ; GFX12-NEXT:  .LBB61_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB61_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB61_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB61_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -9030,7 +9751,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_or_b32_e32 v3, s1, v1
 ; GFX12-NEXT:    v_or_b32_e32 v2, s0, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB61_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB61_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -9057,16 +9778,11 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB62_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB62_4
-; GCN1-NEXT:  .LBB62_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB62_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB62_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -9074,8 +9790,13 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB62_2
-; GCN1-NEXT:  .LBB62_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB62_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB62_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -9090,6 +9811,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN1-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB62_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_or_i64_addr64:
@@ -9108,16 +9830,11 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB62_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB62_4
-; GCN2-NEXT:  .LBB62_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB62_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB62_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -9125,8 +9842,13 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB62_2
-; GCN2-NEXT:  .LBB62_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB62_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB62_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -9140,6 +9862,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GCN2-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB62_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_or_i64_addr64:
@@ -9147,30 +9870,31 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB62_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB62_4
-; GFX12-NEXT:  .LBB62_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB62_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB62_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB62_2
-; GFX12-NEXT:  .LBB62_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB62_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB62_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -9179,6 +9903,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
 ; GFX12-NEXT:    v_or_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_or_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB62_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -9202,9 +9927,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB63_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB63_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -9213,11 +9938,17 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB63_3
-; GCN1-NEXT:    s_branch .LBB63_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB63_3
 ; GCN1-NEXT:  .LBB63_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB63_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB63_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB63_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -9232,7 +9963,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GCN1-NEXT:    v_or_b32_e32 v5, s9, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB63_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB63_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -9253,9 +9984,9 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB63_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB63_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -9264,11 +9995,17 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB63_3
-; GCN2-NEXT:    s_branch .LBB63_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB63_3
 ; GCN2-NEXT:  .LBB63_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB63_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB63_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB63_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -9282,7 +10019,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GCN2-NEXT:    v_or_b32_e32 v5, s9, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB63_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB63_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -9291,27 +10028,34 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GFX12-LABEL: atomic_or_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB63_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB63_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB63_3
-; GFX12-NEXT:    s_branch .LBB63_4
+; GFX12-NEXT:    s_branch .LBB63_3
 ; GFX12-NEXT:  .LBB63_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB63_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB63_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB63_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -9320,7 +10064,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
 ; GFX12-NEXT:    v_or_b32_e32 v3, s5, v1
 ; GFX12-NEXT:    v_or_b32_e32 v2, s4, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB63_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB63_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -9346,16 +10090,11 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB64_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB64_4
-; GCN1-NEXT:  .LBB64_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB64_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB64_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -9363,8 +10102,13 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB64_2
-; GCN1-NEXT:  .LBB64_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB64_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB64_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -9375,6 +10119,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB64_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xchg_i64_offset:
@@ -9391,16 +10136,11 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB64_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB64_4
-; GCN2-NEXT:  .LBB64_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB64_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB64_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -9408,8 +10148,13 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB64_2
-; GCN2-NEXT:  .LBB64_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB64_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB64_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -9419,6 +10164,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB64_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_i64_offset:
@@ -9429,28 +10175,30 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB64_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB64_4
-; GFX12-NEXT:  .LBB64_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB64_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB64_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB64_2
-; GFX12-NEXT:  .LBB64_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB64_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB64_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB64_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -9473,16 +10221,11 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB65_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB65_4
-; GCN1-NEXT:  .LBB65_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB65_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB65_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -9490,8 +10233,13 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB65_2
-; GCN1-NEXT:  .LBB65_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB65_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB65_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -9502,6 +10250,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB65_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xchg_f64_offset:
@@ -9518,16 +10267,11 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB65_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB65_4
-; GCN2-NEXT:  .LBB65_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB65_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB65_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -9535,8 +10279,13 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB65_2
-; GCN2-NEXT:  .LBB65_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB65_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB65_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -9546,6 +10295,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB65_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_f64_offset:
@@ -9556,28 +10306,30 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB65_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB65_4
-; GFX12-NEXT:  .LBB65_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB65_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB65_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB65_2
-; GFX12-NEXT:  .LBB65_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB65_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB65_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB65_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr double, ptr %out, i64 4
@@ -9600,16 +10352,11 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB66_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB66_4
-; GCN1-NEXT:  .LBB66_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB66_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB66_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -9617,8 +10364,13 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB66_2
-; GCN1-NEXT:  .LBB66_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB66_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB66_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -9629,6 +10381,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB66_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xchg_pointer_offset:
@@ -9645,16 +10398,11 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB66_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB66_4
-; GCN2-NEXT:  .LBB66_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB66_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB66_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -9662,8 +10410,13 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB66_2
-; GCN2-NEXT:  .LBB66_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB66_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB66_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -9673,6 +10426,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB66_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_pointer_offset:
@@ -9683,28 +10437,30 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB66_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB66_4
-; GFX12-NEXT:  .LBB66_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB66_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB66_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB66_2
-; GFX12-NEXT:  .LBB66_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB66_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB66_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB66_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr ptr, ptr %out, i32 4
@@ -9728,9 +10484,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB67_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB67_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -9739,11 +10495,17 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB67_3
-; GCN1-NEXT:    s_branch .LBB67_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB67_3
 ; GCN1-NEXT:  .LBB67_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB67_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB67_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB67_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -9756,7 +10518,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN1-NEXT:    buffer_store_dword v2, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB67_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB67_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(2)
@@ -9778,9 +10540,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB67_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB67_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -9789,11 +10551,17 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB67_3
-; GCN2-NEXT:    s_branch .LBB67_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB67_3
 ; GCN2-NEXT:  .LBB67_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB67_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB67_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB67_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -9805,7 +10573,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN2-NEXT:    buffer_store_dword v2, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB67_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB67_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(2)
@@ -9817,32 +10585,39 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB67_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB67_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB67_3
-; GFX12-NEXT:    s_branch .LBB67_4
+; GFX12-NEXT:    s_branch .LBB67_3
 ; GFX12-NEXT:  .LBB67_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB67_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB67_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB67_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB67_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB67_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -9873,16 +10648,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB68_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB68_4
-; GCN1-NEXT:  .LBB68_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB68_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB68_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -9890,8 +10660,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB68_2
-; GCN1-NEXT:  .LBB68_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB68_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB68_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -9902,36 +10677,33 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB68_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xchg_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB68_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB68_4
-; GCN2-NEXT:  .LBB68_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB68_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB68_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -9939,8 +10711,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB68_2
-; GCN2-NEXT:  .LBB68_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB68_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB68_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -9950,6 +10727,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB68_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
@@ -9964,29 +10742,31 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB68_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB68_4
-; GFX12-NEXT:  .LBB68_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB68_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB68_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB68_2
-; GFX12-NEXT:  .LBB68_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB68_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB68_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB68_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -10013,9 +10793,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB69_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB69_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -10024,11 +10804,17 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB69_3
-; GCN1-NEXT:    s_branch .LBB69_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB69_3
 ; GCN1-NEXT:  .LBB69_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB69_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB69_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB69_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -10041,7 +10827,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN1-NEXT:    buffer_store_dword v2, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB69_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB69_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(2)
@@ -10056,8 +10842,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -10065,9 +10851,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB69_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB69_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -10076,11 +10862,17 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB69_3
-; GCN2-NEXT:    s_branch .LBB69_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB69_3
 ; GCN2-NEXT:  .LBB69_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB69_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB69_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB69_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -10092,7 +10884,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN2-NEXT:    buffer_store_dword v2, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB69_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB69_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(2)
@@ -10109,28 +10901,35 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB69_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB69_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB69_3
-; GFX12-NEXT:    s_branch .LBB69_4
+; GFX12-NEXT:    s_branch .LBB69_3
 ; GFX12-NEXT:  .LBB69_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB69_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB69_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB69_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB69_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB69_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -10156,16 +10955,11 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB70_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB70_4
-; GCN1-NEXT:  .LBB70_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB70_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB70_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -10173,8 +10967,13 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB70_2
-; GCN1-NEXT:  .LBB70_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB70_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB70_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
@@ -10185,6 +10984,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB70_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xchg_i64:
@@ -10199,16 +10999,11 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB70_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB70_4
-; GCN2-NEXT:  .LBB70_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB70_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB70_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -10216,8 +11011,13 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB70_2
-; GCN2-NEXT:  .LBB70_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB70_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB70_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -10227,6 +11027,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB70_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_i64:
@@ -10235,28 +11036,30 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB70_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB70_4
-; GFX12-NEXT:  .LBB70_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB70_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB70_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB70_2
-; GFX12-NEXT:  .LBB70_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB70_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB70_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB70_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst
@@ -10277,9 +11080,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB71_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB71_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -10288,11 +11091,17 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB71_3
-; GCN1-NEXT:    s_branch .LBB71_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB71_3
 ; GCN1-NEXT:  .LBB71_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB71_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB71_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB71_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -10305,7 +11114,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN1-NEXT:    buffer_store_dword v2, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB71_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB71_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(2)
@@ -10325,9 +11134,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB71_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB71_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -10336,11 +11145,17 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB71_3
-; GCN2-NEXT:    s_branch .LBB71_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB71_3
 ; GCN2-NEXT:  .LBB71_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB71_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB71_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB71_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -10352,7 +11167,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN2-NEXT:    buffer_store_dword v2, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB71_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB71_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(2)
@@ -10367,27 +11182,34 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB71_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB71_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB71_3
-; GFX12-NEXT:    s_branch .LBB71_4
+; GFX12-NEXT:    s_branch .LBB71_3
 ; GFX12-NEXT:  .LBB71_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB71_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB71_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB71_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB71_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB71_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -10415,16 +11237,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB72_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB72_4
-; GCN1-NEXT:  .LBB72_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB72_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB72_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -10432,8 +11249,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB72_2
-; GCN1-NEXT:  .LBB72_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB72_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB72_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -10444,6 +11266,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB72_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xchg_i64_addr64:
@@ -10462,16 +11285,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB72_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB72_4
-; GCN2-NEXT:  .LBB72_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB72_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB72_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -10479,8 +11297,13 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB72_2
-; GCN2-NEXT:  .LBB72_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB72_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB72_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -10490,6 +11313,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB72_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xchg_i64_addr64:
@@ -10497,35 +11321,37 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB72_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB72_4
-; GFX12-NEXT:  .LBB72_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB72_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB72_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB72_2
-; GFX12-NEXT:  .LBB72_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB72_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB72_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB72_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -10549,9 +11375,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB73_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB73_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -10560,11 +11386,17 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB73_3
-; GCN1-NEXT:    s_branch .LBB73_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB73_3
 ; GCN1-NEXT:  .LBB73_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB73_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB73_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB73_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -10577,7 +11409,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN1-NEXT:    buffer_store_dword v2, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB73_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB73_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    s_waitcnt vmcnt(2)
@@ -10599,9 +11431,9 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB73_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB73_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -10610,11 +11442,17 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB73_3
-; GCN2-NEXT:    s_branch .LBB73_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB73_3
 ; GCN2-NEXT:  .LBB73_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB73_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB73_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB73_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -10626,7 +11464,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN2-NEXT:    buffer_store_dword v2, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB73_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB73_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    s_waitcnt vmcnt(2)
@@ -10636,34 +11474,41 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
 ; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB73_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB73_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB73_3
-; GFX12-NEXT:    s_branch .LBB73_4
+; GFX12-NEXT:    s_branch .LBB73_3
 ; GFX12-NEXT:  .LBB73_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB73_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB73_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB73_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB73_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB73_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
@@ -10690,16 +11535,11 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB74_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB74_4
-; GCN1-NEXT:  .LBB74_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB74_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB74_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -10707,8 +11547,13 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB74_2
-; GCN1-NEXT:  .LBB74_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB74_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB74_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -10723,6 +11568,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB74_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xor_i64_offset:
@@ -10739,16 +11585,11 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB74_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB74_4
-; GCN2-NEXT:  .LBB74_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB74_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB74_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -10756,8 +11597,13 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB74_2
-; GCN2-NEXT:  .LBB74_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB74_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB74_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -10771,6 +11617,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB74_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64_offset:
@@ -10781,23 +11628,24 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB74_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB74_4
-; GFX12-NEXT:  .LBB74_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB74_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB74_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB74_2
-; GFX12-NEXT:  .LBB74_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB74_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB74_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -10806,6 +11654,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB74_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -10829,9 +11678,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB75_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB75_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -10840,11 +11689,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB75_3
-; GCN1-NEXT:    s_branch .LBB75_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB75_3
 ; GCN1-NEXT:  .LBB75_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB75_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB75_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB75_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN1-NEXT:    s_cselect_b32 s2, s2, -1
@@ -10859,7 +11714,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_xor_b32_e32 v5, s1, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB75_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB75_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -10880,9 +11735,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB75_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB75_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -10891,11 +11746,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB75_3
-; GCN2-NEXT:    s_branch .LBB75_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB75_3
 ; GCN2-NEXT:  .LBB75_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB75_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB75_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB75_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -10909,7 +11770,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_xor_b32_e32 v5, s1, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB75_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB75_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -10920,25 +11781,32 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB75_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB75_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB75_3
-; GFX12-NEXT:    s_branch .LBB75_4
+; GFX12-NEXT:    s_branch .LBB75_3
 ; GFX12-NEXT:  .LBB75_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB75_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB75_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB75_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -10947,7 +11815,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_xor_b32_e32 v3, s1, v1
 ; GFX12-NEXT:    v_xor_b32_e32 v2, s0, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB75_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB75_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -10977,16 +11845,11 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB76_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB76_4
-; GCN1-NEXT:  .LBB76_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB76_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB76_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -10994,8 +11857,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB76_2
-; GCN1-NEXT:  .LBB76_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB76_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB76_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -11010,36 +11878,33 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB76_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xor_i64_addr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB76_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB76_4
-; GCN2-NEXT:  .LBB76_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB76_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB76_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -11047,8 +11912,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB76_2
-; GCN2-NEXT:  .LBB76_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB76_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB76_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -11062,6 +11932,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB76_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64_addr64_offset:
@@ -11076,24 +11947,25 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB76_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB76_4
-; GFX12-NEXT:  .LBB76_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB76_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB76_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB76_2
-; GFX12-NEXT:  .LBB76_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB76_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB76_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -11102,6 +11974,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB76_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -11128,9 +12001,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB77_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB77_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -11139,11 +12012,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB77_3
-; GCN1-NEXT:    s_branch .LBB77_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB77_3
 ; GCN1-NEXT:  .LBB77_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB77_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB77_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB77_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -11158,7 +12037,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_xor_b32_e32 v5, s9, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB77_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB77_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -11172,8 +12051,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -11181,9 +12060,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB77_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB77_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -11192,11 +12071,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB77_3
-; GCN2-NEXT:    s_branch .LBB77_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB77_3
 ; GCN2-NEXT:  .LBB77_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB77_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB77_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB77_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -11210,7 +12095,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_xor_b32_e32 v5, s9, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB77_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB77_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -11226,21 +12111,28 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB77_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB77_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB77_3
-; GFX12-NEXT:    s_branch .LBB77_4
+; GFX12-NEXT:    s_branch .LBB77_3
 ; GFX12-NEXT:  .LBB77_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB77_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB77_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB77_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -11249,7 +12141,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_xor_b32_e32 v3, s5, v1
 ; GFX12-NEXT:    v_xor_b32_e32 v2, s4, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB77_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB77_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -11274,16 +12166,11 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB78_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB78_4
-; GCN1-NEXT:  .LBB78_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB78_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB78_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -11291,8 +12178,13 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB78_2
-; GCN1-NEXT:  .LBB78_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB78_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB78_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s4, -1
@@ -11307,6 +12199,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB78_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xor_i64:
@@ -11321,16 +12214,11 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB78_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB78_4
-; GCN2-NEXT:  .LBB78_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB78_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB78_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -11338,8 +12226,13 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB78_2
-; GCN2-NEXT:  .LBB78_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB78_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB78_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -11353,6 +12246,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB78_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64:
@@ -11361,24 +12255,25 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB78_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB78_4
-; GFX12-NEXT:  .LBB78_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB78_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB78_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB78_2
-; GFX12-NEXT:  .LBB78_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB78_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB78_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -11386,6 +12281,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB78_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
@@ -11406,9 +12302,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB79_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB79_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -11417,11 +12313,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB79_3
-; GCN1-NEXT:    s_branch .LBB79_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB79_3
 ; GCN1-NEXT:  .LBB79_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB79_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB79_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB79_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s2, s4, -1
@@ -11436,7 +12338,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_xor_b32_e32 v5, s1, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB79_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB79_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -11455,9 +12357,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB79_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB79_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -11466,11 +12368,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB79_3
-; GCN2-NEXT:    s_branch .LBB79_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB79_3
 ; GCN2-NEXT:  .LBB79_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB79_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB79_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB79_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -11484,7 +12392,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_xor_b32_e32 v5, s1, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB79_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB79_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -11498,21 +12406,28 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB79_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB79_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB79_3
-; GFX12-NEXT:    s_branch .LBB79_4
+; GFX12-NEXT:    s_branch .LBB79_3
 ; GFX12-NEXT:  .LBB79_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB79_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB79_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB79_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -11520,7 +12435,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_xor_b32_e32 v3, s1, v1
 ; GFX12-NEXT:    v_xor_b32_e32 v2, s0, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB79_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB79_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -11547,16 +12462,11 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB80_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB80_4
-; GCN1-NEXT:  .LBB80_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB80_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB80_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -11564,8 +12474,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB80_2
-; GCN1-NEXT:  .LBB80_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB80_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB80_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -11580,6 +12495,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB80_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_xor_i64_addr64:
@@ -11598,16 +12514,11 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB80_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB80_4
-; GCN2-NEXT:  .LBB80_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB80_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB80_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -11615,8 +12526,13 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB80_2
-; GCN2-NEXT:  .LBB80_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB80_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB80_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
@@ -11630,6 +12546,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB80_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_xor_i64_addr64:
@@ -11637,30 +12554,31 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB80_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB80_4
-; GFX12-NEXT:  .LBB80_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB80_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB80_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB80_2
-; GFX12-NEXT:  .LBB80_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB80_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB80_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -11669,6 +12587,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GFX12-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB80_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -11692,9 +12611,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB81_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB81_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -11703,11 +12622,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB81_3
-; GCN1-NEXT:    s_branch .LBB81_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB81_3
 ; GCN1-NEXT:  .LBB81_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB81_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB81_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB81_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -11722,7 +12647,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_xor_b32_e32 v5, s9, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB81_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB81_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -11743,9 +12668,9 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB81_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB81_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -11754,11 +12679,17 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB81_3
-; GCN2-NEXT:    s_branch .LBB81_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB81_3
 ; GCN2-NEXT:  .LBB81_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB81_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB81_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB81_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -11772,7 +12703,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_xor_b32_e32 v5, s9, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB81_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB81_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -11781,27 +12712,34 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_xor_i64_ret_addr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB81_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB81_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB81_3
-; GFX12-NEXT:    s_branch .LBB81_4
+; GFX12-NEXT:    s_branch .LBB81_3
 ; GFX12-NEXT:  .LBB81_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB81_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB81_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB81_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -11810,7 +12748,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_xor_b32_e32 v3, s5, v1
 ; GFX12-NEXT:    v_xor_b32_e32 v2, s4, v0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB81_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB81_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -13196,16 +14134,11 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB107_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB107_4
-; GCN1-NEXT:  .LBB107_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB107_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB107_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -13213,8 +14146,13 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB107_2
-; GCN1-NEXT:  .LBB107_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB107_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB107_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -13232,6 +14170,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v0, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB107_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_inc_i64_offset:
@@ -13248,16 +14187,11 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB107_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB107_4
-; GCN2-NEXT:  .LBB107_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB107_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB107_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -13265,8 +14199,13 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB107_2
-; GCN2-NEXT:  .LBB107_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB107_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB107_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -13283,6 +14222,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v0, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB107_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_offset:
@@ -13293,23 +14233,24 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB107_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB107_4
-; GFX12-NEXT:  .LBB107_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB107_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB107_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB107_2
-; GFX12-NEXT:  .LBB107_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB107_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB107_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -13321,6 +14262,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB107_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -13344,9 +14286,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB108_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB108_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -13355,11 +14297,17 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB108_3
-; GCN1-NEXT:    s_branch .LBB108_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB108_3
 ; GCN1-NEXT:  .LBB108_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB108_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB108_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB108_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GCN1-NEXT:    s_cselect_b32 s2, s2, -1
@@ -13377,7 +14325,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB108_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB108_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -13398,9 +14346,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB108_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB108_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -13409,11 +14357,17 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB108_3
-; GCN2-NEXT:    s_branch .LBB108_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB108_3
 ; GCN2-NEXT:  .LBB108_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB108_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB108_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB108_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -13430,7 +14384,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB108_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB108_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -13441,25 +14395,32 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s3, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB108_2
+; GFX12-NEXT:    s_cmp_eq_u32 s3, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB108_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB108_3
-; GFX12-NEXT:    s_branch .LBB108_4
+; GFX12-NEXT:    s_branch .LBB108_3
 ; GFX12-NEXT:  .LBB108_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB108_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB108_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB108_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s2, -1
@@ -13471,7 +14432,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB108_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB108_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -13501,16 +14462,11 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB109_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB109_4
-; GCN1-NEXT:  .LBB109_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB109_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB109_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -13518,8 +14474,13 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB109_2
-; GCN1-NEXT:  .LBB109_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB109_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB109_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -13537,36 +14498,33 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v0, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB109_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_inc_i64_incr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB109_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB109_4
-; GCN2-NEXT:  .LBB109_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB109_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB109_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -13574,8 +14532,13 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB109_2
-; GCN2-NEXT:  .LBB109_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB109_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB109_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -13592,6 +14555,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v0, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB109_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_incr64_offset:
@@ -13606,24 +14570,25 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB109_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB109_4
-; GFX12-NEXT:  .LBB109_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB109_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB109_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB109_2
-; GFX12-NEXT:  .LBB109_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB109_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB109_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -13635,6 +14600,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB109_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -13661,9 +14627,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB110_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB110_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -13672,11 +14638,17 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB110_3
-; GCN1-NEXT:    s_branch .LBB110_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB110_3
 ; GCN1-NEXT:  .LBB110_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB110_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB110_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB110_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -13694,7 +14666,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB110_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB110_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -13708,8 +14680,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -13717,9 +14689,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB110_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB110_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -13728,11 +14700,17 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB110_3
-; GCN2-NEXT:    s_branch .LBB110_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB110_3
 ; GCN2-NEXT:  .LBB110_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB110_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB110_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB110_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -13749,7 +14727,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB110_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB110_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -13765,21 +14743,28 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB110_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB110_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB110_3
-; GFX12-NEXT:    s_branch .LBB110_4
+; GFX12-NEXT:    s_branch .LBB110_3
 ; GFX12-NEXT:  .LBB110_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB110_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB110_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB110_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -13791,7 +14776,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB110_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB110_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -13816,16 +14801,11 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB111_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB111_4
-; GCN1-NEXT:  .LBB111_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB111_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB111_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -13833,8 +14813,13 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB111_2
-; GCN1-NEXT:  .LBB111_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB111_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB111_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s4, -1
@@ -13852,6 +14837,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v0, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB111_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_inc_i64:
@@ -13866,16 +14852,11 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB111_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB111_4
-; GCN2-NEXT:  .LBB111_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB111_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB111_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -13883,8 +14864,13 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB111_2
-; GCN2-NEXT:  .LBB111_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB111_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB111_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -13901,6 +14887,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v0, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB111_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64:
@@ -13909,24 +14896,25 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB111_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB111_4
-; GFX12-NEXT:  .LBB111_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB111_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB111_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB111_2
-; GFX12-NEXT:  .LBB111_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB111_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB111_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
@@ -13937,6 +14925,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB111_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
@@ -13957,9 +14946,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB112_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB112_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -13968,11 +14957,17 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB112_3
-; GCN1-NEXT:    s_branch .LBB112_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB112_3
 ; GCN1-NEXT:  .LBB112_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB112_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB112_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB112_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s2, s4, -1
@@ -13990,7 +14985,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB112_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB112_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -14009,9 +15004,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB112_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB112_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -14020,11 +15015,17 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB112_3
-; GCN2-NEXT:    s_branch .LBB112_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB112_3
 ; GCN2-NEXT:  .LBB112_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB112_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB112_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB112_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -14041,7 +15042,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB112_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB112_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -14055,21 +15056,28 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s3
-; GFX12-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
-; GFX12-NEXT:    s_cbranch_vccz .LBB112_2
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB112_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    s_mov_b32 s2, 0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB112_3
-; GFX12-NEXT:    s_branch .LBB112_4
+; GFX12-NEXT:    s_branch .LBB112_3
 ; GFX12-NEXT:  .LBB112_2:
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB112_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB112_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB112_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s2, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s2
@@ -14080,7 +15088,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s2
-; GFX12-NEXT:  .LBB112_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB112_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -14107,16 +15115,11 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB113_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB113_4
-; GCN1-NEXT:  .LBB113_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB113_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB113_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -14124,8 +15127,13 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB113_2
-; GCN1-NEXT:  .LBB113_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB113_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB113_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -14143,6 +15151,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v0, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB113_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_inc_i64_incr64:
@@ -14161,16 +15170,11 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB113_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB113_4
-; GCN2-NEXT:  .LBB113_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB113_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB113_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -14178,8 +15182,13 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB113_2
-; GCN2-NEXT:  .LBB113_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB113_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB113_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -14196,6 +15205,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v0, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB113_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_inc_i64_incr64:
@@ -14203,30 +15213,31 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB113_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB113_4
-; GFX12-NEXT:  .LBB113_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB113_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB113_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB113_2
-; GFX12-NEXT:  .LBB113_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB113_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB113_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -14238,6 +15249,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
+; GFX12-NEXT:  .LBB113_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -14261,9 +15273,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB114_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB114_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -14272,11 +15284,17 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB114_3
-; GCN1-NEXT:    s_branch .LBB114_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB114_3
 ; GCN1-NEXT:  .LBB114_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB114_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB114_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB114_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GCN1-NEXT:    s_cselect_b32 s0, s0, -1
@@ -14294,7 +15312,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB114_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB114_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -14315,9 +15333,9 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB114_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB114_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -14326,11 +15344,17 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB114_3
-; GCN2-NEXT:    s_branch .LBB114_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB114_3
 ; GCN2-NEXT:  .LBB114_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB114_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB114_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB114_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -14347,7 +15371,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB114_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB114_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -14356,27 +15380,34 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_inc_i64_ret_incr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB114_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB114_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB114_3
-; GFX12-NEXT:    s_branch .LBB114_4
+; GFX12-NEXT:    s_branch .LBB114_3
 ; GFX12-NEXT:  .LBB114_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB114_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB114_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB114_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
@@ -14388,7 +15419,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_dual_cndmask_b32 v3, 0, v3 :: v_dual_cndmask_b32 v2, 0, v2
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s0
-; GFX12-NEXT:  .LBB114_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB114_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -14414,16 +15445,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_add_u32 s0, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB115_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB115_4
-; GCN1-NEXT:  .LBB115_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB115_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB115_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -14431,8 +15457,13 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB115_2
-; GCN1-NEXT:  .LBB115_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB115_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB115_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -14453,6 +15484,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB115_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_dec_i64_offset:
@@ -14469,16 +15501,11 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_add_u32 s0, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB115_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB115_4
-; GCN2-NEXT:  .LBB115_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB115_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB115_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -14486,8 +15513,13 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB115_2
-; GCN2-NEXT:  .LBB115_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB115_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB115_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -14507,6 +15539,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB115_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_offset:
@@ -14517,23 +15550,24 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB115_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB115_4
-; GFX12-NEXT:  .LBB115_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB115_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB115_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB115_2
-; GFX12-NEXT:  .LBB115_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB115_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB115_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s4, s0, -1
@@ -14550,6 +15584,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
 ; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s4
+; GFX12-NEXT:  .LBB115_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i64, ptr %out, i64 4
@@ -14573,9 +15608,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s2, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN1-NEXT:    s_cbranch_vccz .LBB116_2
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB116_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -14584,11 +15619,17 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB116_3
-; GCN1-NEXT:    s_branch .LBB116_4
+; GCN1-NEXT:    s_mov_b64 s[4:5], 0
+; GCN1-NEXT:    s_branch .LBB116_3
 ; GCN1-NEXT:  .LBB116_2:
+; GCN1-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB116_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB116_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB116_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s1
 ; GCN1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -14610,7 +15651,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB116_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB116_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -14631,9 +15672,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s2, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s3, s5, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s3, s8
-; GCN2-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GCN2-NEXT:    s_cbranch_vccz .LBB116_2
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB116_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -14642,11 +15683,17 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB116_3
-; GCN2-NEXT:    s_branch .LBB116_4
+; GCN2-NEXT:    s_mov_b64 s[4:5], 0
+; GCN2-NEXT:    s_branch .LBB116_3
 ; GCN2-NEXT:  .LBB116_2:
+; GCN2-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB116_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB116_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN2-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB116_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s2, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -14667,7 +15714,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB116_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB116_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -14678,25 +15725,32 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[4:5], 32
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB116_2
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB116_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB116_3
-; GFX12-NEXT:    s_branch .LBB116_4
+; GFX12-NEXT:    s_branch .LBB116_3
 ; GFX12-NEXT:  .LBB116_2:
+; GFX12-NEXT:    s_mov_b32 s4, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB116_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB116_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB116_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s4, s0, -1
@@ -14713,7 +15767,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_cndmask_b32_e64 v3, v3, s3, s0
 ; GFX12-NEXT:    v_cndmask_b32_e64 v2, v2, s2, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s4
-; GFX12-NEXT:  .LBB116_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB116_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -14743,16 +15797,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB117_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB117_4
-; GCN1-NEXT:  .LBB117_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB117_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB117_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -14760,8 +15809,13 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB117_2
-; GCN1-NEXT:  .LBB117_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB117_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB117_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -14782,36 +15836,33 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB117_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_dec_i64_decr64_offset:
 ; GCN2:       ; %bb.0: ; %entry
 ; GCN2-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
 ; GCN2-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GCN2-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
+; GCN2-NEXT:    s_mov_b32 s90, -1
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0xfc
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
+; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB117_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB117_4
-; GCN2-NEXT:  .LBB117_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB117_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB117_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -14819,8 +15870,13 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB117_2
-; GCN2-NEXT:  .LBB117_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB117_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB117_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -14840,6 +15896,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB117_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_decr64_offset:
@@ -14854,24 +15911,25 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB117_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB117_4
-; GFX12-NEXT:  .LBB117_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB117_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB117_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB117_2
-; GFX12-NEXT:  .LBB117_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB117_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB117_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s4, s0, -1
@@ -14888,6 +15946,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
 ; GFX12-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
 ; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s4
+; GFX12-NEXT:  .LBB117_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -14914,9 +15973,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    s_add_u32 s0, s0, 32
 ; GCN1-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB118_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB118_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -14925,11 +15984,17 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB118_3
-; GCN1-NEXT:    s_branch .LBB118_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB118_3
 ; GCN1-NEXT:  .LBB118_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB118_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB118_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB118_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -14951,7 +16016,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB118_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB118_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -14965,8 +16030,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_mov_b32 s91, 0xe80000
 ; GCN2-NEXT:    s_add_u32 s88, s88, s9
 ; GCN2-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
+; GCN2-NEXT:    s_load_dword s2, s[2:3], 0x104
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_lshl_b64 s[0:1], s[10:11], 3
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
@@ -14974,9 +16039,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    s_add_u32 s0, s0, 32
 ; GCN2-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB118_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB118_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -14985,11 +16050,17 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB118_3
-; GCN2-NEXT:    s_branch .LBB118_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB118_3
 ; GCN2-NEXT:  .LBB118_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB118_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB118_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB118_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -15010,7 +16081,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB118_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB118_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -15026,21 +16097,28 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 32
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB118_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB118_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB118_3
-; GFX12-NEXT:    s_branch .LBB118_4
+; GFX12-NEXT:    s_branch .LBB118_3
 ; GFX12-NEXT:  .LBB118_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB118_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB118_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB118_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s6, s0, -1
@@ -15057,7 +16135,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
 ; GFX12-NEXT:    v_cndmask_b32_e64 v3, v3, s5, s0
 ; GFX12-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s6
-; GFX12-NEXT:  .LBB118_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB118_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -15082,16 +16160,11 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN1-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN1-NEXT:    s_mov_b64 s[0:1], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB119_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN1-NEXT:    s_cbranch_vccz .LBB119_4
-; GCN1-NEXT:  .LBB119_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB119_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB119_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -15099,8 +16172,13 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB119_2
-; GCN1-NEXT:  .LBB119_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[0:1], 0
+; GCN1-NEXT:  .LBB119_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN1-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB119_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[0:1], s[0:1], exec
@@ -15121,6 +16199,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB119_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_dec_i64:
@@ -15135,16 +16214,11 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s0
-; GCN2-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
 ; GCN2-NEXT:    s_mov_b64 s[0:1], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB119_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN2-NEXT:    s_cbranch_vccz .LBB119_4
-; GCN2-NEXT:  .LBB119_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB119_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB119_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -15152,8 +16226,13 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB119_2
-; GCN2-NEXT:  .LBB119_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[0:1], 0
+; GCN2-NEXT:  .LBB119_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN2-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB119_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -15173,6 +16252,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB119_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64:
@@ -15181,24 +16261,25 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB119_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB119_4
-; GFX12-NEXT:  .LBB119_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB119_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB119_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB119_2
-; GFX12-NEXT:  .LBB119_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB119_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB119_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s4, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s4
@@ -15214,6 +16295,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
 ; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s4
+; GFX12-NEXT:  .LBB119_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
@@ -15234,9 +16316,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB120_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB120_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s1
@@ -15245,11 +16327,17 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB120_3
-; GCN1-NEXT:    s_branch .LBB120_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB120_3
 ; GCN1-NEXT:  .LBB120_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB120_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB120_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB120_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s1
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -15271,7 +16359,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB120_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB120_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -15290,9 +16378,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    s_addc_u32 s89, s89, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s8
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB120_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB120_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s1
@@ -15301,11 +16389,17 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB120_3
-; GCN2-NEXT:    s_branch .LBB120_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB120_3
 ; GCN2-NEXT:  .LBB120_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB120_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB120_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB120_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s2, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s2
@@ -15326,7 +16420,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB120_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB120_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -15340,21 +16434,28 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_eq_u32 s5, s1
-; GFX12-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_vccz .LBB120_2
+; GFX12-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB120_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s0, 0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB120_3
-; GFX12-NEXT:    s_branch .LBB120_4
+; GFX12-NEXT:    s_branch .LBB120_3
 ; GFX12-NEXT:  .LBB120_2:
+; GFX12-NEXT:    s_mov_b32 s0, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB120_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB120_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB120_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX12-NEXT:    s_cselect_b32 s4, s4, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s4
@@ -15371,7 +16472,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
 ; GFX12-NEXT:    v_cndmask_b32_e64 v3, v3, s3, s0
 ; GFX12-NEXT:    v_cndmask_b32_e64 v2, v2, s2, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s4
-; GFX12-NEXT:  .LBB120_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB120_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
@@ -15398,16 +16499,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN1-NEXT:    s_mov_b64 s[2:3], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB121_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB121_4
-; GCN1-NEXT:  .LBB121_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_endpgm
-; GCN1-NEXT:  .LBB121_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB121_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s1
@@ -15415,8 +16511,13 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB121_2
-; GCN1-NEXT:  .LBB121_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:  .LBB121_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB121_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s6
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -15437,6 +16538,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GCN1-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT:  .LBB121_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_endpgm
 ;
 ; GCN2-LABEL: atomic_dec_i64_decr64:
@@ -15455,16 +16557,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
 ; GCN2-NEXT:    s_mov_b64 s[2:3], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB121_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB121_4
-; GCN2-NEXT:  .LBB121_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_endpgm
-; GCN2-NEXT:  .LBB121_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB121_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s1
@@ -15472,8 +16569,13 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB121_2
-; GCN2-NEXT:  .LBB121_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:  .LBB121_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB121_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -15493,6 +16595,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GCN2-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT:  .LBB121_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: atomic_dec_i64_decr64:
@@ -15500,30 +16603,31 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
-; GFX12-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[4:5], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s5
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
 ; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    s_cbranch_vccnz .LBB121_3
-; GFX12-NEXT:  ; %bb.1: ; %Flow
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccz .LBB121_4
-; GFX12-NEXT:  .LBB121_2: ; %atomicrmw.phi
-; GFX12-NEXT:    s_endpgm
-; GFX12-NEXT:  .LBB121_3: ; %atomicrmw.global
+; GFX12-NEXT:    s_cbranch_scc0 .LBB121_2
+; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_mov_b32 s4, 0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execnz .LBB121_2
-; GFX12-NEXT:  .LBB121_4: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB121_2: ; %Flow
+; GFX12-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB121_4
+; GFX12-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s4, s0, -1
@@ -15540,6 +16644,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
 ; GFX12-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
 ; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s4
+; GFX12-NEXT:  .LBB121_4: ; %atomicrmw.phi
 ; GFX12-NEXT:    s_endpgm
 entry:
   %ptr = getelementptr i64, ptr %out, i64 %index
@@ -15563,9 +16668,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    s_add_u32 s0, s4, s0
 ; GCN1-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN1-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN1-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN1-NEXT:    s_cbranch_vccz .LBB122_2
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB122_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s8
@@ -15574,11 +16679,17 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB122_3
-; GCN1-NEXT:    s_branch .LBB122_4
+; GCN1-NEXT:    s_mov_b64 s[2:3], 0
+; GCN1-NEXT:    s_branch .LBB122_3
 ; GCN1-NEXT:  .LBB122_2:
+; GCN1-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB122_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB122_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN1-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB122_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v5, s8
 ; GCN1-NEXT:    s_and_b64 s[2:3], s[2:3], exec
@@ -15600,7 +16711,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GCN1-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[12:15], 0 offen
-; GCN1-NEXT:  .LBB122_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB122_5: ; %atomicrmw.end
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN1-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -15621,9 +16732,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    s_add_u32 s0, s4, s0
 ; GCN2-NEXT:    s_addc_u32 s1, s5, s1
 ; GCN2-NEXT:    s_cmp_eq_u32 s1, s2
-; GCN2-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN2-NEXT:    s_cbranch_vccz .LBB122_2
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB122_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s8
@@ -15632,11 +16743,17 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB122_3
-; GCN2-NEXT:    s_branch .LBB122_4
+; GCN2-NEXT:    s_mov_b64 s[2:3], 0
+; GCN2-NEXT:    s_branch .LBB122_3
 ; GCN2-NEXT:  .LBB122_2:
+; GCN2-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB122_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB122_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN2-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB122_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN2-NEXT:    s_cselect_b32 s0, s0, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s0
@@ -15657,7 +16774,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GCN2-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[88:91], 0 offen
-; GCN2-NEXT:  .LBB122_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB122_5: ; %atomicrmw.end
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN2-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -15666,27 +16783,34 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-LABEL: atomic_dec_i64_ret_decr64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b64 s[8:9], src_private_base
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[6:7]
-; GFX12-NEXT:    s_cmp_eq_u32 s1, s9
-; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX12-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GFX12-NEXT:    s_cmp_eq_u32 s1, s7
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s6
-; GFX12-NEXT:    s_cbranch_vccz .LBB122_2
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc0 .LBB122_2
 ; GFX12-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_mov_b32 s6, 0
 ; GFX12-NEXT:    flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX12-NEXT:    global_inv scope:SCOPE_DEV
-; GFX12-NEXT:    s_cbranch_execz .LBB122_3
-; GFX12-NEXT:    s_branch .LBB122_4
+; GFX12-NEXT:    s_branch .LBB122_3
 ; GFX12-NEXT:  .LBB122_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX12-NEXT:  .LBB122_3: ; %atomicrmw.private
+; GFX12-NEXT:  .LBB122_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB122_5
+; GFX12-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX12-NEXT:    s_cselect_b32 s6, s0, -1
@@ -15703,7 +16827,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
 ; GFX12-NEXT:    v_cndmask_b32_e64 v3, v3, s5, s0
 ; GFX12-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s0
 ; GFX12-NEXT:    scratch_store_b64 off, v[2:3], s6
-; GFX12-NEXT:  .LBB122_4: ; %atomicrmw.end
+; GFX12-NEXT:  .LBB122_5: ; %atomicrmw.end
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index d7bd4b1e4918e8..7aea1dee84ed31 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -467,16 +467,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN1-NEXT:    s_mov_b64 s[34:35], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB4_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB4_4
-; GCN1-NEXT:  .LBB4_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB4_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -484,8 +479,13 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB4_2
-; GCN1-NEXT:  .LBB4_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:  .LBB4_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB4_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
@@ -496,6 +496,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB4_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -506,16 +507,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN2-NEXT:    s_mov_b64 s[34:35], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB4_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB4_4
-; GCN2-NEXT:  .LBB4_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB4_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -523,8 +519,13 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB4_2
-; GCN2-NEXT:  .LBB4_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:  .LBB4_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB4_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -534,6 +535,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB4_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -542,16 +544,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN3-NEXT:    s_mov_b64 s[34:35], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB4_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB4_4
-; GCN3-NEXT:  .LBB4_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB4_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB4_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
@@ -559,8 +556,13 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB4_2
-; GCN3-NEXT:  .LBB4_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:  .LBB4_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB4_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s7
@@ -568,6 +570,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN3-NEXT:  .LBB4_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst
@@ -584,16 +587,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN1-NEXT:    s_mov_b64 s[36:37], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB5_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB5_4
-; GCN1-NEXT:  .LBB5_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB5_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB5_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s35
@@ -601,8 +599,13 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB5_2
-; GCN1-NEXT:  .LBB5_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:  .LBB5_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB5_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
@@ -613,6 +616,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB5_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -625,16 +629,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN2-NEXT:    s_mov_b64 s[36:37], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB5_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB5_4
-; GCN2-NEXT:  .LBB5_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB5_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB5_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s35
@@ -642,8 +641,13 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB5_2
-; GCN2-NEXT:  .LBB5_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:  .LBB5_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB5_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -653,6 +657,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB5_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -663,16 +668,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN3-NEXT:    s_mov_b64 s[36:37], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB5_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB5_4
-; GCN3-NEXT:  .LBB5_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB5_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB5_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s35
@@ -680,8 +680,13 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB5_2
-; GCN3-NEXT:  .LBB5_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:  .LBB5_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB5_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s7
@@ -689,6 +694,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN3-NEXT:  .LBB5_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -704,9 +710,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB6_2
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB6_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -715,11 +721,17 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB6_3
-; GCN1-NEXT:    s_branch .LBB6_4
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:    s_branch .LBB6_3
 ; GCN1-NEXT:  .LBB6_2:
+; GCN1-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB6_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB6_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB6_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
@@ -732,7 +744,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN1-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB6_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB6_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -743,9 +755,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB6_2
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB6_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -754,11 +766,17 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB6_3
-; GCN2-NEXT:    s_branch .LBB6_4
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:    s_branch .LBB6_3
 ; GCN2-NEXT:  .LBB6_2:
+; GCN2-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB6_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB6_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB6_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -770,7 +788,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN2-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB6_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB6_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -779,9 +797,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB6_2
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB6_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -790,11 +808,17 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB6_3
-; GCN3-NEXT:    s_branch .LBB6_4
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:    s_branch .LBB6_3
 ; GCN3-NEXT:  .LBB6_2:
+; GCN3-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB6_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB6_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB6_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -804,7 +828,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB6_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB6_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst
@@ -821,9 +845,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB7_2
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB7_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -832,11 +856,17 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB7_3
-; GCN1-NEXT:    s_branch .LBB7_4
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:    s_branch .LBB7_3
 ; GCN1-NEXT:  .LBB7_2:
+; GCN1-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB7_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB7_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB7_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
@@ -849,7 +879,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN1-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB7_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB7_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -862,9 +892,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB7_2
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB7_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -873,11 +903,17 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB7_3
-; GCN2-NEXT:    s_branch .LBB7_4
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:    s_branch .LBB7_3
 ; GCN2-NEXT:  .LBB7_2:
+; GCN2-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB7_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB7_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB7_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -889,7 +925,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN2-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB7_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB7_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -900,9 +936,9 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB7_2
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB7_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -911,11 +947,17 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB7_3
-; GCN3-NEXT:    s_branch .LBB7_4
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:    s_branch .LBB7_3
 ; GCN3-NEXT:  .LBB7_2:
+; GCN3-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB7_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB7_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB7_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -925,7 +967,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB7_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB7_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -1627,16 +1669,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN1-NEXT:    s_mov_b64 s[34:35], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB14_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB14_4
-; GCN1-NEXT:  .LBB14_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB14_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB14_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -1644,8 +1681,13 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB14_2
-; GCN1-NEXT:  .LBB14_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:  .LBB14_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB14_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
@@ -1656,6 +1698,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB14_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1666,16 +1709,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN2-NEXT:    s_mov_b64 s[34:35], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB14_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB14_4
-; GCN2-NEXT:  .LBB14_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB14_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB14_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -1683,8 +1721,13 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB14_2
-; GCN2-NEXT:  .LBB14_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:  .LBB14_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB14_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -1694,6 +1737,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB14_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1702,16 +1746,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN3-NEXT:    s_mov_b64 s[34:35], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB14_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB14_4
-; GCN3-NEXT:  .LBB14_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB14_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB14_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
@@ -1719,8 +1758,13 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB14_2
-; GCN3-NEXT:  .LBB14_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:  .LBB14_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB14_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s7
@@ -1728,6 +1772,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN3-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN3-NEXT:  .LBB14_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst
@@ -1744,16 +1789,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN1-NEXT:    s_mov_b64 s[36:37], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB15_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB15_4
-; GCN1-NEXT:  .LBB15_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB15_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB15_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s35
@@ -1761,8 +1801,13 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB15_2
-; GCN1-NEXT:  .LBB15_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:  .LBB15_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB15_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
@@ -1773,6 +1818,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN1-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB15_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1785,16 +1831,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN2-NEXT:    s_mov_b64 s[36:37], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB15_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB15_4
-; GCN2-NEXT:  .LBB15_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB15_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB15_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s35
@@ -1802,8 +1843,13 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB15_2
-; GCN2-NEXT:  .LBB15_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:  .LBB15_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB15_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
@@ -1813,6 +1859,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s34
 ; GCN2-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB15_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1823,16 +1870,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN3-NEXT:    s_mov_b64 s[36:37], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB15_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB15_4
-; GCN3-NEXT:  .LBB15_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB15_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB15_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s35
@@ -1840,8 +1882,13 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB15_2
-; GCN3-NEXT:  .LBB15_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:  .LBB15_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB15_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s7
@@ -1849,6 +1896,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GCN3-NEXT:  .LBB15_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %out, i32 4
@@ -1864,9 +1912,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB16_2
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -1875,11 +1923,17 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB16_3
-; GCN1-NEXT:    s_branch .LBB16_4
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:    s_branch .LBB16_3
 ; GCN1-NEXT:  .LBB16_2:
+; GCN1-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB16_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB16_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB16_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
@@ -1892,7 +1946,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN1-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB16_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB16_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1903,9 +1957,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB16_2
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -1914,11 +1968,17 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB16_3
-; GCN2-NEXT:    s_branch .LBB16_4
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:    s_branch .LBB16_3
 ; GCN2-NEXT:  .LBB16_2:
+; GCN2-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB16_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB16_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB16_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -1930,7 +1990,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN2-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB16_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB16_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1939,9 +1999,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB16_2
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1950,11 +2010,17 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB16_3
-; GCN3-NEXT:    s_branch .LBB16_4
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:    s_branch .LBB16_3
 ; GCN3-NEXT:  .LBB16_2:
+; GCN3-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB16_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB16_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB16_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -1964,7 +2030,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB16_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB16_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, double %in seq_cst
@@ -1981,9 +2047,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB17_2
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB17_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -1992,11 +2058,17 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB17_3
-; GCN1-NEXT:    s_branch .LBB17_4
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:    s_branch .LBB17_3
 ; GCN1-NEXT:  .LBB17_2:
+; GCN1-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB17_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB17_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB17_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s6
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
@@ -2009,7 +2081,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN1-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB17_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB17_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2022,9 +2094,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB17_2
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB17_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -2033,11 +2105,17 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB17_3
-; GCN2-NEXT:    s_branch .LBB17_4
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:    s_branch .LBB17_3
 ; GCN2-NEXT:  .LBB17_2:
+; GCN2-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB17_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB17_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB17_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -2049,7 +2127,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN2-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB17_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB17_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2060,9 +2138,9 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB17_2
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB17_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2071,11 +2149,17 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB17_3
-; GCN3-NEXT:    s_branch .LBB17_4
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:    s_branch .LBB17_3
 ; GCN3-NEXT:  .LBB17_2:
+; GCN3-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB17_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB17_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB17_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -2085,7 +2169,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s7
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB17_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB17_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %out, i32 4
@@ -2845,16 +2929,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN1-NEXT:    s_mov_b64 s[34:35], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB24_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB24_4
-; GCN1-NEXT:  .LBB24_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB24_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB24_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -2862,8 +2941,13 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB24_2
-; GCN1-NEXT:  .LBB24_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:  .LBB24_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB24_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
@@ -2879,6 +2963,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB24_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2889,16 +2974,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN2-NEXT:    s_mov_b64 s[34:35], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB24_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB24_4
-; GCN2-NEXT:  .LBB24_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB24_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB24_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -2906,8 +2986,13 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB24_2
-; GCN2-NEXT:  .LBB24_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:  .LBB24_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB24_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -2922,6 +3007,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB24_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2930,16 +3016,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN3-NEXT:    s_mov_b64 s[34:35], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB24_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB24_4
-; GCN3-NEXT:  .LBB24_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB24_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB24_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
@@ -2947,8 +3028,13 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB24_2
-; GCN3-NEXT:  .LBB24_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:  .LBB24_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB24_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -2961,6 +3047,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v3, vcc
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB24_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst
@@ -2977,16 +3064,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN1-NEXT:    s_mov_b64 s[36:37], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB25_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB25_4
-; GCN1-NEXT:  .LBB25_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB25_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB25_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s35
@@ -2994,8 +3076,13 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB25_2
-; GCN1-NEXT:  .LBB25_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:  .LBB25_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB25_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
@@ -3011,6 +3098,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB25_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3023,16 +3111,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN2-NEXT:    s_mov_b64 s[36:37], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB25_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB25_4
-; GCN2-NEXT:  .LBB25_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB25_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB25_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s35
@@ -3040,8 +3123,13 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB25_2
-; GCN2-NEXT:  .LBB25_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:  .LBB25_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB25_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -3056,6 +3144,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB25_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3066,16 +3155,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN3-NEXT:    s_mov_b64 s[36:37], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB25_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB25_4
-; GCN3-NEXT:  .LBB25_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB25_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB25_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s35
@@ -3083,8 +3167,13 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB25_2
-; GCN3-NEXT:  .LBB25_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:  .LBB25_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB25_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -3097,6 +3186,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v3, vcc
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB25_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -3112,9 +3202,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB26_2
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB26_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -3123,11 +3213,17 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB26_3
-; GCN1-NEXT:    s_branch .LBB26_4
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:    s_branch .LBB26_3
 ; GCN1-NEXT:  .LBB26_2:
+; GCN1-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB26_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB26_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB26_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
@@ -3143,7 +3239,7 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB26_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB26_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3154,9 +3250,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB26_2
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB26_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -3165,11 +3261,17 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB26_3
-; GCN2-NEXT:    s_branch .LBB26_4
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:    s_branch .LBB26_3
 ; GCN2-NEXT:  .LBB26_2:
+; GCN2-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB26_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB26_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB26_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -3184,7 +3286,7 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB26_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB26_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3193,9 +3295,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB26_2
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB26_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3204,11 +3306,17 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB26_3
-; GCN3-NEXT:    s_branch .LBB26_4
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:    s_branch .LBB26_3
 ; GCN3-NEXT:  .LBB26_2:
+; GCN3-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB26_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB26_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB26_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -3221,7 +3329,7 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB26_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB26_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr %ptr, i64 %in seq_cst
@@ -3238,9 +3346,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB27_2
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB27_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -3249,11 +3357,17 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB27_3
-; GCN1-NEXT:    s_branch .LBB27_4
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:    s_branch .LBB27_3
 ; GCN1-NEXT:  .LBB27_2:
+; GCN1-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB27_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB27_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB27_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
@@ -3269,7 +3383,7 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB27_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB27_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3282,9 +3396,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB27_2
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB27_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -3293,11 +3407,17 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB27_3
-; GCN2-NEXT:    s_branch .LBB27_4
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:    s_branch .LBB27_3
 ; GCN2-NEXT:  .LBB27_2:
+; GCN2-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB27_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB27_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB27_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -3312,7 +3432,7 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB27_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB27_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3323,9 +3443,9 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB27_2
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB27_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -3334,11 +3454,17 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB27_3
-; GCN3-NEXT:    s_branch .LBB27_4
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:    s_branch .LBB27_3
 ; GCN3-NEXT:  .LBB27_2:
+; GCN3-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB27_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB27_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB27_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -3351,7 +3477,7 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB27_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB27_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -4140,16 +4266,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN1-NEXT:    s_mov_b64 s[34:35], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB34_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB34_4
-; GCN1-NEXT:  .LBB34_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB34_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB34_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -4157,8 +4278,13 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB34_2
-; GCN1-NEXT:  .LBB34_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:  .LBB34_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB34_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
@@ -4174,6 +4300,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB34_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4184,16 +4311,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN2-NEXT:    s_mov_b64 s[34:35], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB34_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB34_4
-; GCN2-NEXT:  .LBB34_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB34_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB34_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -4201,8 +4323,13 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB34_2
-; GCN2-NEXT:  .LBB34_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:  .LBB34_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB34_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -4217,6 +4344,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB34_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4225,16 +4353,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN3-NEXT:    s_mov_b64 s[34:35], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB34_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB34_4
-; GCN3-NEXT:  .LBB34_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB34_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB34_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
@@ -4242,8 +4365,13 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB34_2
-; GCN3-NEXT:  .LBB34_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:  .LBB34_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB34_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -4256,6 +4384,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB34_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst
@@ -4272,16 +4401,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN1-NEXT:    s_mov_b64 s[36:37], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB35_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB35_4
-; GCN1-NEXT:  .LBB35_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB35_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB35_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s35
@@ -4289,8 +4413,13 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB35_2
-; GCN1-NEXT:  .LBB35_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:  .LBB35_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB35_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
@@ -4306,6 +4435,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB35_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4318,16 +4448,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN2-NEXT:    s_mov_b64 s[36:37], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB35_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB35_4
-; GCN2-NEXT:  .LBB35_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB35_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB35_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s35
@@ -4335,8 +4460,13 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB35_2
-; GCN2-NEXT:  .LBB35_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:  .LBB35_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB35_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -4351,6 +4481,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB35_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4361,16 +4492,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN3-NEXT:    s_mov_b64 s[36:37], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB35_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB35_4
-; GCN3-NEXT:  .LBB35_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB35_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB35_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s35
@@ -4378,8 +4504,13 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB35_2
-; GCN3-NEXT:  .LBB35_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:  .LBB35_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB35_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -4392,6 +4523,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB35_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -4407,9 +4539,9 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB36_2
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB36_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -4418,11 +4550,17 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB36_3
-; GCN1-NEXT:    s_branch .LBB36_4
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:    s_branch .LBB36_3
 ; GCN1-NEXT:  .LBB36_2:
+; GCN1-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB36_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB36_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB36_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
@@ -4438,7 +4576,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB36_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB36_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4449,9 +4587,9 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB36_2
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB36_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -4460,11 +4598,17 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB36_3
-; GCN2-NEXT:    s_branch .LBB36_4
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:    s_branch .LBB36_3
 ; GCN2-NEXT:  .LBB36_2:
+; GCN2-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB36_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB36_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB36_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -4479,7 +4623,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB36_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB36_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4488,9 +4632,9 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB36_2
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB36_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -4499,11 +4643,17 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB36_3
-; GCN3-NEXT:    s_branch .LBB36_4
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:    s_branch .LBB36_3
 ; GCN3-NEXT:  .LBB36_2:
+; GCN3-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB36_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB36_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB36_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -4516,7 +4666,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB36_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB36_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr %ptr, i64 %in seq_cst
@@ -4533,9 +4683,9 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB37_2
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB37_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -4544,11 +4694,17 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB37_3
-; GCN1-NEXT:    s_branch .LBB37_4
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:    s_branch .LBB37_3
 ; GCN1-NEXT:  .LBB37_2:
+; GCN1-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB37_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB37_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB37_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s7
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
@@ -4564,7 +4720,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN1-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB37_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB37_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4577,9 +4733,9 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB37_2
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB37_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -4588,11 +4744,17 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB37_3
-; GCN2-NEXT:    s_branch .LBB37_4
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:    s_branch .LBB37_3
 ; GCN2-NEXT:  .LBB37_2:
+; GCN2-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB37_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB37_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB37_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -4607,7 +4769,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    v_subb_u32_e32 v4, vcc, v1, v4, vcc
 ; GCN2-NEXT:    buffer_store_dword v5, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB37_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB37_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4618,9 +4780,9 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB37_2
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB37_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -4629,11 +4791,17 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB37_3
-; GCN3-NEXT:    s_branch .LBB37_4
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:    s_branch .LBB37_3
 ; GCN3-NEXT:  .LBB37_2:
+; GCN3-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB37_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB37_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB37_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -4646,7 +4814,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB37_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB37_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -5435,16 +5603,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN1-NEXT:    s_mov_b64 s[34:35], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB44_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB44_4
-; GCN1-NEXT:  .LBB44_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB44_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB44_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -5452,8 +5615,13 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB44_2
-; GCN1-NEXT:  .LBB44_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:  .LBB44_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB44_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s4, -1
@@ -5468,6 +5636,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB44_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5478,16 +5647,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN2-NEXT:    s_mov_b64 s[34:35], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB44_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB44_4
-; GCN2-NEXT:  .LBB44_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB44_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB44_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -5495,8 +5659,13 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB44_2
-; GCN2-NEXT:  .LBB44_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:  .LBB44_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB44_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -5510,6 +5679,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB44_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5518,16 +5688,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN3-NEXT:    s_mov_b64 s[34:35], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB44_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB44_4
-; GCN3-NEXT:  .LBB44_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB44_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB44_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
@@ -5535,8 +5700,13 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB44_2
-; GCN3-NEXT:  .LBB44_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:  .LBB44_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB44_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -5548,6 +5718,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    v_and_b32_e32 v2, s6, v2
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB44_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
@@ -5564,16 +5735,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN1-NEXT:    s_mov_b64 s[36:37], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB45_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB45_4
-; GCN1-NEXT:  .LBB45_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB45_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB45_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s35
@@ -5581,8 +5747,13 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB45_2
-; GCN1-NEXT:  .LBB45_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:  .LBB45_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB45_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s34, -1
@@ -5597,6 +5768,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB45_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5609,16 +5781,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN2-NEXT:    s_mov_b64 s[36:37], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB45_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB45_4
-; GCN2-NEXT:  .LBB45_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB45_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB45_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s35
@@ -5626,8 +5793,13 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB45_2
-; GCN2-NEXT:  .LBB45_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:  .LBB45_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB45_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -5641,6 +5813,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_and_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB45_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5651,16 +5824,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN3-NEXT:    s_mov_b64 s[36:37], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB45_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB45_4
-; GCN3-NEXT:  .LBB45_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB45_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB45_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s35
@@ -5668,8 +5836,13 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB45_2
-; GCN3-NEXT:  .LBB45_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:  .LBB45_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB45_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -5681,6 +5854,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_and_b32_e32 v2, s6, v2
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB45_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -5696,9 +5870,9 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB46_2
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB46_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -5707,11 +5881,17 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB46_3
-; GCN1-NEXT:    s_branch .LBB46_4
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:    s_branch .LBB46_3
 ; GCN1-NEXT:  .LBB46_2:
+; GCN1-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB46_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB46_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB46_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s4, -1
@@ -5726,7 +5906,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    v_and_b32_e32 v5, s7, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB46_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB46_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5737,9 +5917,9 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB46_2
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB46_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -5748,11 +5928,17 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB46_3
-; GCN2-NEXT:    s_branch .LBB46_4
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:    s_branch .LBB46_3
 ; GCN2-NEXT:  .LBB46_2:
+; GCN2-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB46_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB46_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB46_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -5766,7 +5952,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    v_and_b32_e32 v5, s7, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB46_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB46_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5775,9 +5961,9 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB46_2
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB46_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -5786,11 +5972,17 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB46_3
-; GCN3-NEXT:    s_branch .LBB46_4
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:    s_branch .LBB46_3
 ; GCN3-NEXT:  .LBB46_2:
+; GCN3-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB46_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB46_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB46_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -5802,7 +5994,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    v_and_b32_e32 v4, s6, v0
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB46_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB46_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr %ptr, i64 %in seq_cst
@@ -5819,9 +6011,9 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB47_2
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB47_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -5830,11 +6022,17 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB47_3
-; GCN1-NEXT:    s_branch .LBB47_4
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:    s_branch .LBB47_3
 ; GCN1-NEXT:  .LBB47_2:
+; GCN1-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB47_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB47_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB47_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s34, -1
@@ -5849,7 +6047,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    v_and_b32_e32 v5, s7, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB47_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB47_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5862,9 +6060,9 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB47_2
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB47_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -5873,11 +6071,17 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB47_3
-; GCN2-NEXT:    s_branch .LBB47_4
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:    s_branch .LBB47_3
 ; GCN2-NEXT:  .LBB47_2:
+; GCN2-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB47_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB47_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB47_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -5891,7 +6095,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    v_and_b32_e32 v5, s7, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB47_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB47_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5902,9 +6106,9 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB47_2
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB47_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -5913,11 +6117,17 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB47_3
-; GCN3-NEXT:    s_branch .LBB47_4
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:    s_branch .LBB47_3
 ; GCN3-NEXT:  .LBB47_2:
+; GCN3-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB47_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB47_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB47_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -5929,7 +6139,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    v_and_b32_e32 v4, s6, v0
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB47_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB47_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -7156,10 +7366,10 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN1-NEXT:    s_mov_b64 s[34:35], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB56_3
+; GCN1-NEXT:    s_cbranch_scc1 .LBB56_3
 ; GCN1-NEXT:  ; %bb.1: ; %Flow3
 ; GCN1-NEXT:    s_and_b64 vcc, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_vccnz .LBB56_6
@@ -7221,10 +7431,10 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN2-NEXT:    s_mov_b64 s[34:35], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB56_3
+; GCN2-NEXT:    s_cbranch_scc1 .LBB56_3
 ; GCN2-NEXT:  ; %bb.1: ; %Flow3
 ; GCN2-NEXT:    s_and_b64 vcc, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_vccnz .LBB56_6
@@ -7283,10 +7493,10 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN3-NEXT:    s_mov_b64 s[34:35], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB56_3
+; GCN3-NEXT:    s_cbranch_scc1 .LBB56_3
 ; GCN3-NEXT:  ; %bb.1: ; %Flow3
 ; GCN3-NEXT:    s_and_b64 vcc, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_vccnz .LBB56_6
@@ -7346,10 +7556,10 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN1-NEXT:    s_mov_b64 s[36:37], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB57_3
+; GCN1-NEXT:    s_cbranch_scc1 .LBB57_3
 ; GCN1-NEXT:  ; %bb.1: ; %Flow3
 ; GCN1-NEXT:    s_and_b64 vcc, exec, s[36:37]
 ; GCN1-NEXT:    s_cbranch_vccnz .LBB57_6
@@ -7413,10 +7623,10 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN2-NEXT:    s_mov_b64 s[36:37], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB57_3
+; GCN2-NEXT:    s_cbranch_scc1 .LBB57_3
 ; GCN2-NEXT:  ; %bb.1: ; %Flow3
 ; GCN2-NEXT:    s_and_b64 vcc, exec, s[36:37]
 ; GCN2-NEXT:    s_cbranch_vccnz .LBB57_6
@@ -7477,10 +7687,10 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN3-NEXT:    s_mov_b64 s[36:37], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB57_3
+; GCN3-NEXT:    s_cbranch_scc1 .LBB57_3
 ; GCN3-NEXT:  ; %bb.1: ; %Flow3
 ; GCN3-NEXT:    s_and_b64 vcc, exec, s[36:37]
 ; GCN3-NEXT:    s_cbranch_vccnz .LBB57_6
@@ -7641,9 +7851,9 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB59_4
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB59_4
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
@@ -7704,9 +7914,9 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB59_4
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB59_4
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
@@ -7764,9 +7974,9 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB59_4
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB59_4
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s5
@@ -7825,9 +8035,9 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB60_4
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB60_4
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    s_add_u32 s36, s34, 4
 ; GCN1-NEXT:    s_addc_u32 s37, s35, 0
@@ -7890,9 +8100,9 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB60_4
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB60_4
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    s_add_u32 s36, s34, 4
 ; GCN2-NEXT:    s_addc_u32 s37, s35, 0
@@ -7952,9 +8162,9 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB60_4
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB60_4
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v3, s35
@@ -9019,16 +9229,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN1-NEXT:    s_mov_b64 s[34:35], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB68_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB68_4
-; GCN1-NEXT:  .LBB68_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB68_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB68_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -9036,8 +9241,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB68_2
-; GCN1-NEXT:  .LBB68_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:  .LBB68_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB68_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s4, -1
@@ -9052,6 +9262,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN1-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB68_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9062,16 +9273,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN2-NEXT:    s_mov_b64 s[34:35], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB68_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB68_4
-; GCN2-NEXT:  .LBB68_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB68_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB68_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -9079,8 +9285,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB68_2
-; GCN2-NEXT:  .LBB68_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:  .LBB68_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB68_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -9094,6 +9305,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN2-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB68_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9102,16 +9314,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN3-NEXT:    s_mov_b64 s[34:35], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB68_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB68_4
-; GCN3-NEXT:  .LBB68_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB68_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB68_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
@@ -9119,8 +9326,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB68_2
-; GCN3-NEXT:  .LBB68_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:  .LBB68_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB68_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -9132,6 +9344,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN3-NEXT:    v_or_b32_e32 v2, s6, v2
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB68_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
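
Besides the compare rewrite, the noret variants above also restructure the block layout: rather than chaining s_cbranch_execnz fallthroughs between the global and private paths, the generated code now records the taken path in an SGPR pair and re-tests it in an explicit %Flow block with the same zext-and-compare idiom. Roughly, following the GCN1 output above (a sketch, not verbatim output):

  s_mov_b64      s[34:35], -1          ; assume the private path
  s_cbranch_scc0 .LBBn_2               ; uniform address-space test
; %atomicrmw.global
  ...                                  ; global atomic
  s_mov_b64      s[34:35], 0           ; global path was taken
.LBBn_2: ; %Flow
  s_and_b64      s[34:35], s[34:35], exec
  s_cselect_b32  s34, 1, 0             ; flag -> i32 (1 iff any bit set)
  s_cmp_lg_u32   s34, 1
  s_cbranch_scc1 .LBBn_4               ; skip %atomicrmw.private
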
@@ -9148,16 +9361,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN1-NEXT:    s_mov_b64 s[36:37], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB69_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB69_4
-; GCN1-NEXT:  .LBB69_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB69_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB69_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s35
@@ -9165,8 +9373,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB69_2
-; GCN1-NEXT:  .LBB69_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:  .LBB69_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB69_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s34, -1
@@ -9181,6 +9394,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN1-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB69_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9193,16 +9407,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN2-NEXT:    s_mov_b64 s[36:37], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB69_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB69_4
-; GCN2-NEXT:  .LBB69_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB69_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB69_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s35
@@ -9210,8 +9419,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB69_2
-; GCN2-NEXT:  .LBB69_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:  .LBB69_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB69_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -9225,6 +9439,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN2-NEXT:    v_or_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB69_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9235,16 +9450,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN3-NEXT:    s_mov_b64 s[36:37], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB69_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB69_4
-; GCN3-NEXT:  .LBB69_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB69_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB69_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s35
@@ -9252,8 +9462,13 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB69_2
-; GCN3-NEXT:  .LBB69_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:  .LBB69_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB69_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -9265,6 +9480,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN3-NEXT:    v_or_b32_e32 v2, s6, v2
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB69_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -9280,9 +9496,9 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB70_2
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB70_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -9291,11 +9507,17 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB70_3
-; GCN1-NEXT:    s_branch .LBB70_4
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:    s_branch .LBB70_3
 ; GCN1-NEXT:  .LBB70_2:
+; GCN1-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB70_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB70_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB70_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s4, -1
@@ -9310,7 +9532,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN1-NEXT:    v_or_b32_e32 v5, s7, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB70_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB70_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9321,9 +9543,9 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB70_2
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB70_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -9332,11 +9554,17 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB70_3
-; GCN2-NEXT:    s_branch .LBB70_4
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:    s_branch .LBB70_3
 ; GCN2-NEXT:  .LBB70_2:
+; GCN2-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB70_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB70_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB70_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -9350,7 +9578,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN2-NEXT:    v_or_b32_e32 v5, s7, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB70_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB70_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9359,9 +9587,9 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB70_2
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB70_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -9370,11 +9598,17 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB70_3
-; GCN3-NEXT:    s_branch .LBB70_4
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:    s_branch .LBB70_3
 ; GCN3-NEXT:  .LBB70_2:
+; GCN3-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB70_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB70_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB70_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -9386,7 +9620,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN3-NEXT:    v_or_b32_e32 v4, s6, v0
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB70_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB70_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr %ptr, i64 %in seq_cst
@@ -9403,9 +9637,9 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB71_2
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB71_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -9414,11 +9648,17 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB71_3
-; GCN1-NEXT:    s_branch .LBB71_4
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:    s_branch .LBB71_3
 ; GCN1-NEXT:  .LBB71_2:
+; GCN1-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB71_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB71_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB71_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s34, -1
@@ -9433,7 +9673,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    v_or_b32_e32 v5, s7, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB71_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB71_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9446,9 +9686,9 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB71_2
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB71_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -9457,11 +9697,17 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB71_3
-; GCN2-NEXT:    s_branch .LBB71_4
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:    s_branch .LBB71_3
 ; GCN2-NEXT:  .LBB71_2:
+; GCN2-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB71_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB71_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB71_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -9475,7 +9721,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    v_or_b32_e32 v5, s7, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB71_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB71_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -9486,9 +9732,9 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB71_2
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB71_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -9497,11 +9743,17 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB71_3
-; GCN3-NEXT:    s_branch .LBB71_4
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:    s_branch .LBB71_3
 ; GCN3-NEXT:  .LBB71_2:
+; GCN3-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB71_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB71_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB71_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -9513,7 +9765,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    v_or_b32_e32 v4, s6, v0
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB71_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB71_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -10302,16 +10554,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN1-NEXT:    s_mov_b64 s[34:35], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB78_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB78_4
-; GCN1-NEXT:  .LBB78_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB78_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB78_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -10319,8 +10566,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB78_2
-; GCN1-NEXT:  .LBB78_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:  .LBB78_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB78_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s4, -1
@@ -10335,6 +10587,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB78_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10345,16 +10598,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN2-NEXT:    s_mov_b64 s[34:35], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB78_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB78_4
-; GCN2-NEXT:  .LBB78_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB78_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB78_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -10362,8 +10610,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB78_2
-; GCN2-NEXT:  .LBB78_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:  .LBB78_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB78_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -10377,6 +10630,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB78_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10385,16 +10639,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
 ; GCN3-NEXT:    s_mov_b64 s[34:35], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB78_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB78_4
-; GCN3-NEXT:  .LBB78_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB78_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB78_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
@@ -10402,8 +10651,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB78_2
-; GCN3-NEXT:  .LBB78_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:  .LBB78_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB78_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -10415,6 +10669,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB78_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
@@ -10431,16 +10686,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN1-NEXT:    s_mov_b64 s[36:37], -1
-; GCN1-NEXT:    s_cbranch_vccnz .LBB79_3
-; GCN1-NEXT:  ; %bb.1: ; %Flow
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB79_4
-; GCN1-NEXT:  .LBB79_2: ; %atomicrmw.phi
-; GCN1-NEXT:    s_setpc_b64 s[30:31]
-; GCN1-NEXT:  .LBB79_3: ; %atomicrmw.global
+; GCN1-NEXT:    s_cbranch_scc0 .LBB79_2
+; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s35
@@ -10448,8 +10698,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execnz .LBB79_2
-; GCN1-NEXT:  .LBB79_4: ; %atomicrmw.private
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:  .LBB79_2: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB79_4
+; GCN1-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s34, -1
@@ -10464,6 +10719,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN1-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT:  .LBB79_4: ; %atomicrmw.phi
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10476,16 +10732,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN2-NEXT:    s_mov_b64 s[36:37], -1
-; GCN2-NEXT:    s_cbranch_vccnz .LBB79_3
-; GCN2-NEXT:  ; %bb.1: ; %Flow
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB79_4
-; GCN2-NEXT:  .LBB79_2: ; %atomicrmw.phi
-; GCN2-NEXT:    s_setpc_b64 s[30:31]
-; GCN2-NEXT:  .LBB79_3: ; %atomicrmw.global
+; GCN2-NEXT:    s_cbranch_scc0 .LBB79_2
+; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s35
@@ -10493,8 +10744,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execnz .LBB79_2
-; GCN2-NEXT:  .LBB79_4: ; %atomicrmw.private
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:  .LBB79_2: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB79_4
+; GCN2-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -10508,6 +10764,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_xor_b32_e32 v3, s7, v3
 ; GCN2-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT:  .LBB79_4: ; %atomicrmw.phi
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10518,16 +10775,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
 ; GCN3-NEXT:    s_mov_b64 s[36:37], -1
-; GCN3-NEXT:    s_cbranch_vccnz .LBB79_3
-; GCN3-NEXT:  ; %bb.1: ; %Flow
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB79_4
-; GCN3-NEXT:  .LBB79_2: ; %atomicrmw.phi
-; GCN3-NEXT:    s_setpc_b64 s[30:31]
-; GCN3-NEXT:  .LBB79_3: ; %atomicrmw.global
+; GCN3-NEXT:    s_cbranch_scc0 .LBB79_2
+; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s35
@@ -10535,8 +10787,13 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execnz .LBB79_2
-; GCN3-NEXT:  .LBB79_4: ; %atomicrmw.private
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:  .LBB79_2: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB79_4
+; GCN3-NEXT:  ; %bb.3: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
@@ -10548,6 +10805,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GCN3-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GCN3-NEXT:  .LBB79_4: ; %atomicrmw.phi
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
@@ -10563,9 +10821,9 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN1-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT:    s_cbranch_vccz .LBB80_2
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB80_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -10574,11 +10832,17 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB80_3
-; GCN1-NEXT:    s_branch .LBB80_4
+; GCN1-NEXT:    s_mov_b64 s[34:35], 0
+; GCN1-NEXT:    s_branch .LBB80_3
 ; GCN1-NEXT:  .LBB80_2:
+; GCN1-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB80_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB80_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB80_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
 ; GCN1-NEXT:    s_and_b64 s[34:35], s[34:35], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s4, -1
@@ -10593,7 +10857,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    v_xor_b32_e32 v5, s7, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB80_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB80_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10604,9 +10868,9 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    s_load_dword s34, s[34:35], 0x0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s5, s34
-; GCN2-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT:    s_cbranch_vccz .LBB80_2
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB80_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -10615,11 +10879,17 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB80_3
-; GCN2-NEXT:    s_branch .LBB80_4
+; GCN2-NEXT:    s_mov_b64 s[34:35], 0
+; GCN2-NEXT:    s_branch .LBB80_3
 ; GCN2-NEXT:  .LBB80_2:
+; GCN2-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB80_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB80_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN2-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB80_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -10633,7 +10903,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    v_xor_b32_e32 v5, s7, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB80_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB80_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10642,9 +10912,9 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    s_mov_b64 s[34:35], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s5, s35
-; GCN3-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT:    s_cbranch_vccz .LBB80_2
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB80_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -10653,11 +10923,17 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB80_3
-; GCN3-NEXT:    s_branch .LBB80_4
+; GCN3-NEXT:    s_mov_b64 s[34:35], 0
+; GCN3-NEXT:    s_branch .LBB80_3
 ; GCN3-NEXT:  .LBB80_2:
+; GCN3-NEXT:    s_mov_b64 s[34:35], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB80_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB80_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[34:35], s[34:35], exec
+; GCN3-NEXT:    s_cselect_b32 s34, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s34, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB80_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s4, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -10669,7 +10945,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    v_xor_b32_e32 v4, s6, v0
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB80_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB80_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr %ptr, i64 %in seq_cst
@@ -10686,9 +10962,9 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN1-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN1-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT:    s_cbranch_vccz .LBB81_2
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc0 .LBB81_2
 ; GCN1-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -10697,11 +10973,17 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
-; GCN1-NEXT:    s_cbranch_execz .LBB81_3
-; GCN1-NEXT:    s_branch .LBB81_4
+; GCN1-NEXT:    s_mov_b64 s[36:37], 0
+; GCN1-NEXT:    s_branch .LBB81_3
 ; GCN1-NEXT:  .LBB81_2:
+; GCN1-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN1-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT:  .LBB81_3: ; %atomicrmw.private
+; GCN1-NEXT:  .LBB81_3: ; %Flow
+; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN1-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN1-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN1-NEXT:    s_cbranch_scc1 .LBB81_5
+; GCN1-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN1-NEXT:    v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
 ; GCN1-NEXT:    s_and_b64 s[36:37], s[36:37], exec
 ; GCN1-NEXT:    s_cselect_b32 s34, s34, -1
@@ -10716,7 +10998,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    v_xor_b32_e32 v5, s7, v1
 ; GCN1-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT:  .LBB81_4: ; %atomicrmw.end
+; GCN1-NEXT:  .LBB81_5: ; %atomicrmw.end
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10729,9 +11011,9 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN2-NEXT:    s_cmp_eq_u32 s35, s36
-; GCN2-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT:    s_cbranch_vccz .LBB81_2
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc0 .LBB81_2
 ; GCN2-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -10740,11 +11022,17 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
-; GCN2-NEXT:    s_cbranch_execz .LBB81_3
-; GCN2-NEXT:    s_branch .LBB81_4
+; GCN2-NEXT:    s_mov_b64 s[36:37], 0
+; GCN2-NEXT:    s_branch .LBB81_3
 ; GCN2-NEXT:  .LBB81_2:
+; GCN2-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN2-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT:  .LBB81_3: ; %atomicrmw.private
+; GCN2-NEXT:  .LBB81_3: ; %Flow
+; GCN2-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN2-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN2-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN2-NEXT:    s_cbranch_scc1 .LBB81_5
+; GCN2-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN2-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN2-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -10758,7 +11046,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    v_xor_b32_e32 v5, s7, v1
 ; GCN2-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT:  .LBB81_4: ; %atomicrmw.end
+; GCN2-NEXT:  .LBB81_5: ; %atomicrmw.end
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -10769,9 +11057,9 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN3-NEXT:    s_mov_b64 s[36:37], src_private_base
 ; GCN3-NEXT:    s_cmp_eq_u32 s35, s37
-; GCN3-NEXT:    s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT:    s_cbranch_vccz .LBB81_2
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc0 .LBB81_2
 ; GCN3-NEXT:  ; %bb.1: ; %atomicrmw.global
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s34
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -10780,11 +11068,17 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
-; GCN3-NEXT:    s_cbranch_execz .LBB81_3
-; GCN3-NEXT:    s_branch .LBB81_4
+; GCN3-NEXT:    s_mov_b64 s[36:37], 0
+; GCN3-NEXT:    s_branch .LBB81_3
 ; GCN3-NEXT:  .LBB81_2:
+; GCN3-NEXT:    s_mov_b64 s[36:37], -1
 ; GCN3-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT:  .LBB81_3: ; %atomicrmw.private
+; GCN3-NEXT:  .LBB81_3: ; %Flow
+; GCN3-NEXT:    s_and_b64 s[36:37], s[36:37], exec
+; GCN3-NEXT:    s_cselect_b32 s36, 1, 0
+; GCN3-NEXT:    s_cmp_lg_u32 s36, 1
+; GCN3-NEXT:    s_cbranch_scc1 .LBB81_5
+; GCN3-NEXT:  ; %bb.4: ; %atomicrmw.private
 ; GCN3-NEXT:    s_cmp_lg_u64 s[34:35], 0
 ; GCN3-NEXT:    s_cselect_b32 s34, s34, -1
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s34
@@ -10796,7 +11090,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    v_xor_b32_e32 v4, s6, v0
 ; GCN3-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
 ; GCN3-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT:  .LBB81_4: ; %atomicrmw.end
+; GCN3-NEXT:  .LBB81_5: ; %atomicrmw.end
 ; GCN3-NEXT:    s_waitcnt vmcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 4412e04e121a96..99ea3d56d3a237 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1550,10 +1550,13 @@ define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) {
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_nlt_f32_e64 s[0:1], s0, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GCN-NEXT:    v_cmp_nge_f32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
-; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN-NEXT:    v_cmp_nle_f32_e64 s[0:1], s2, 0
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mul_f32_e64 v0, -s2, v0
 ; GCN-NEXT:    v_cmp_le_f32_e32 vcc, 0, v0
 ; GCN-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-NEXT:    s_endpgm
@@ -1563,12 +1566,15 @@ define amdgpu_kernel void @fnge_select_f32_multi_use_regression(float %.i2369) {
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    v_cmp_nle_f32_e64 s1, s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s1, s0, 0
+; GFX11-NEXT:    v_mul_f32_e64 v0, -s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_le_f32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX11-NEXT:    s_endpgm
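
The fneg-modifier-casting hunks above show the uniform-select half of the change: an i1 compare result feeding a uniform use is no longer materialized per lane with v_cndmask_b32 but on the SALU, with a copy to a VGPR only where one is required. As a minimal sketch of the substitution (register numbers are illustrative):

  ; before: VALU select per lane
  v_cndmask_b32_e64 v0, 0, 1, s[0:1]

  ; after: scalar select, then a copy if a VGPR is needed
  s_and_b64     s[0:1], s[0:1], exec   ; SCC = any active lane set
  s_cselect_b32 s2, 1, 0               ; uniform zext of the i1
  v_mov_b32_e32 v0, s2

The fp-classify.ll hunks that follow apply the same substitution.
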
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 038e7b4f5e2bb8..cfc1d4616a2c12 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -9,14 +9,16 @@ declare double @llvm.fabs.f64(double) #1
 define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, float %x) #0 {
 ; SI-LABEL: test_isinf_pattern:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x204
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -27,9 +29,11 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x204
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -38,11 +42,12 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s4, 0x204
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
@@ -55,14 +60,16 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f
 define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 {
 ; SI-LABEL: test_not_isinf_pattern_0:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_nlg_f32_e64 s[4:5], |s4|, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -73,9 +80,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_nlg_f32_e64 s[2:3], |s4|, v0
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -84,11 +93,12 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_nlg_f32_e64 s2, 0x7f800000, |s4|
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
@@ -136,14 +146,16 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %
 define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %out, float %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_0:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -154,9 +166,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -165,11 +179,12 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s4, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
@@ -184,14 +199,16 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
 define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -202,9 +219,11 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -213,11 +232,12 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s4, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #3
@@ -234,10 +254,12 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_o_f32_e64 s[4:5], s4, s4
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -247,9 +269,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -258,11 +282,12 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_o_f32_e64 s2, s4, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
@@ -278,16 +303,18 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur
 define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocapture %out, float %x) #0 {
 ; SI-LABEL: test_isfinite_not_pattern_1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_load_dword s6, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_o_f32_e64 s[4:5], s6, s6
 ; SI-NEXT:    v_cmp_neq_f32_e32 vcc, s6, v0
 ; SI-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -300,9 +327,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur
 ; VI-NEXT:    v_cmp_o_f32_e64 s[2:3], s4, s4
 ; VI-NEXT:    v_cmp_neq_f32_e32 vcc, s4, v0
 ; VI-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -311,13 +340,15 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_o_f32_e64 s2, s4, s4
 ; GFX11-NEXT:    v_cmp_neq_f32_e64 s3, 0x7f800000, s4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
@@ -334,15 +365,17 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cmp_o_f32_e64 s[4:5], s2, s2
+; SI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s3|, v0
+; SI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; SI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; SI-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_cmp_o_f32_e64 s[0:1], s2, s2
-; SI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s3|, v0
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -354,22 +387,26 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
 ; VI-NEXT:    v_cmp_o_f32_e64 s[4:5], s2, s2
 ; VI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s3|, v0
 ; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_isfinite_not_pattern_2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_o_f32_e64 s2, s2, s2
 ; GFX11-NEXT:    v_cmp_neq_f32_e64 s3, 0x7f800000, |s3|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
@@ -385,16 +422,18 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
 define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocapture %out, float %x) #0 {
 ; SI-LABEL: test_isfinite_not_pattern_3:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_load_dword s6, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_u_f32_e64 s[4:5], s6, s6
 ; SI-NEXT:    v_cmp_neq_f32_e64 s[6:7], |s6|, v0
 ; SI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -407,9 +446,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
 ; VI-NEXT:    v_cmp_u_f32_e64 s[2:3], s4, s4
 ; VI-NEXT:    v_cmp_neq_f32_e64 s[4:5], |s4|, v0
 ; VI-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -418,13 +459,15 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_u_f32_e64 s2, s4, s4
 ; GFX11-NEXT:    v_cmp_neq_f32_e64 s3, 0x7f800000, |s4|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp uno float %x, 0.000000e+00
@@ -439,14 +482,16 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
 define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %out, float %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_4:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -457,9 +502,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -468,11 +515,12 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s4, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
@@ -487,14 +535,16 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
 define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) nocapture %out, float %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_4_commute_and:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -505,9 +555,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -516,11 +568,12 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s4, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, 0.000000e+00
@@ -536,17 +589,19 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
 ; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s0, s[2:3], 0x14
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
 ; SI-NEXT:    s_load_dword s1, s[2:3], 0xb
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    v_cmp_o_f32_e32 vcc, s1, v1
 ; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], s1, v0
 ; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -561,9 +616,11 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
 ; VI-NEXT:    v_cmp_class_f32_e32 vcc, s1, v0
 ; VI-NEXT:    v_cmp_o_f32_e64 s[0:1], s1, v1
 ; VI-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -573,13 +630,15 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b32 s5, s[2:3], 0x50
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_class_f32_e64 s3, s4, 0x1f8
 ; GFX11-NEXT:    v_cmp_o_f32_e64 s2, s4, s5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord float %x, %y
@@ -597,12 +656,12 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
-; SI-NEXT:    s_cmpk_eq_i32 s4, 0x7c00
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b32 s2, s4, 0x7fff
+; SI-NEXT:    s_cmpk_eq_i32 s2, 0x7c00
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -613,9 +672,11 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x204
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -624,11 +685,12 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_class_f16_e64 s2, s4, 0x204
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %fabs = tail call half @llvm.fabs.f16(half %x) #1
@@ -644,15 +706,17 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    s_and_b32 s2, s4, 0x7fff
 ; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; SI-NEXT:    s_cmpk_lg_i32 s4, 0x7c00
+; SI-NEXT:    s_cmpk_lg_i32 s2, 0x7c00
 ; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -663,9 +727,11 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -674,11 +740,12 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_class_f16_e64 s2, s4, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord half %x, 0.0
@@ -696,15 +763,17 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    s_and_b32 s2, s4, 0x7fff
 ; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; SI-NEXT:    s_cmpk_lt_i32 s4, 0x7c00
+; SI-NEXT:    s_cmpk_lt_i32 s2, 0x7c00
 ; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -715,9 +784,11 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -726,11 +797,12 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_class_f16_e64 s2, s4, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %ord = fcmp ord half %x, 0.0
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 3571f3545ad1a1..68a4e7c3e8851c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -740,10 +740,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_eq_f32_e64 s[4:5], -1.0, s4
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -755,7 +757,9 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_f32_e64 s[4:5], -1.0, s4
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -790,10 +794,12 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_eq_f32_e64 s[4:5], -1.0, |s4|
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -805,7 +811,9 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_f32_e64 s[4:5], -1.0, |s4|
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index c6b4e129bacbe2..2c6ee7ea34bc8d 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -622,10 +622,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_eq_f32_e64 s[4:5], 1.0, s4
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -637,7 +639,9 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_f32_e64 s[4:5], 1.0, s4
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -672,10 +676,12 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_eq_f32_e64 s[4:5], 1.0, |s4|
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -687,7 +693,9 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_f32_e64 s[4:5], 1.0, |s4|
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 9e92a89501cf6b..6ab7c5ff97a65e 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -592,11 +592,13 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, -1.0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -608,7 +610,9 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_f16_e64 s[4:5], -1.0, s4
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -618,12 +622,14 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, -1.0, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -635,8 +641,10 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f16_e64 s2, -1.0, s4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 804208998f9e8f..a67d62f0d87f89 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -593,7 +593,9 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 1.0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -606,7 +608,9 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_f16_e64 s[4:5], 1.0, s4
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -616,12 +620,14 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-TRUE16-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, 1.0, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -634,8 +640,10 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
 ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f16_e64 s2, 1.0, s4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 7a18e2ef7b4a84..006853d2c9f529 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -97,59 +97,54 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s4
-; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    s_lshr_b32 s4, s7, 8
-; SI-NEXT:    s_and_b32 s5, s7, 0x1ff
-; SI-NEXT:    s_and_b32 s8, s4, 0xffe
-; SI-NEXT:    s_or_b32 s4, s5, s6
-; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SI-NEXT:    s_bfe_u32 s4, s7, 0xb0014
-; SI-NEXT:    v_readfirstlane_b32 s5, v0
-; SI-NEXT:    s_sub_i32 s6, 0x3f1, s4
-; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc10
-; SI-NEXT:    s_or_b32 s11, s8, s5
-; SI-NEXT:    v_med3_i32 v0, s6, 0, 13
-; SI-NEXT:    s_lshl_b32 s4, s10, 12
-; SI-NEXT:    s_or_b32 s5, s11, 0x1000
-; SI-NEXT:    v_readfirstlane_b32 s6, v0
-; SI-NEXT:    s_or_b32 s4, s11, s4
-; SI-NEXT:    s_lshr_b32 s6, s5, s6
-; SI-NEXT:    v_lshl_b32_e32 v0, s6, v0
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT:    v_readfirstlane_b32 s5, v0
-; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_cmp_lt_i32 s10, 1
-; SI-NEXT:    s_cselect_b32 s6, s5, s4
-; SI-NEXT:    s_and_b32 s8, s6, 7
+; SI-NEXT:    s_lshr_b32 s0, s7, 8
+; SI-NEXT:    s_and_b32 s1, s7, 0x1ff
+; SI-NEXT:    s_and_b32 s0, s0, 0xffe
+; SI-NEXT:    s_or_b32 s1, s1, s6
+; SI-NEXT:    s_cmp_lg_u32 s1, 0
+; SI-NEXT:    s_cselect_b32 s1, 1, 0
+; SI-NEXT:    s_bfe_u32 s2, s7, 0xb0014
+; SI-NEXT:    s_or_b32 s6, s0, s1
+; SI-NEXT:    s_sub_i32 s0, 0x3f1, s2
+; SI-NEXT:    s_or_b32 s1, s6, 0x1000
+; SI-NEXT:    v_med3_i32 v0, s0, 0, 13
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    s_lshr_b32 s8, s1, s0
+; SI-NEXT:    s_lshl_b32 s0, s8, s0
+; SI-NEXT:    s_cmp_lg_u32 s0, s1
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    s_addk_i32 s2, 0xfc10
+; SI-NEXT:    s_or_b32 s0, s8, s0
+; SI-NEXT:    s_lshl_b32 s1, s2, 12
+; SI-NEXT:    s_or_b32 s1, s6, s1
+; SI-NEXT:    s_cmp_lt_i32 s2, 1
+; SI-NEXT:    s_cselect_b32 s10, s0, s1
+; SI-NEXT:    s_and_b32 s8, s10, 7
 ; SI-NEXT:    s_cmp_gt_i32 s8, 5
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    s_cmp_eq_u32 s8, 3
 ; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; SI-NEXT:    s_lshr_b32 s6, s6, 2
-; SI-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    s_addc_u32 s4, s6, 0
-; SI-NEXT:    s_cmp_lt_i32 s10, 31
-; SI-NEXT:    s_cselect_b32 s6, s4, 0x7c00
-; SI-NEXT:    s_cmp_lg_u32 s11, 0
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SI-NEXT:    s_cmpk_eq_i32 s10, 0x40f
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
-; SI-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s4, s7, 16
-; SI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-NEXT:    s_and_b32 s4, s4, 0x8000
-; SI-NEXT:    v_or_b32_e32 v0, s4, v0
+; SI-NEXT:    s_lshr_b32 s10, s10, 2
+; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; SI-NEXT:    s_or_b32 s0, s0, s1
+; SI-NEXT:    s_cmp_lg_u32 s0, 0
+; SI-NEXT:    s_addc_u32 s0, s10, 0
+; SI-NEXT:    s_cmp_lt_i32 s2, 31
+; SI-NEXT:    s_cselect_b32 s0, s0, 0x7c00
+; SI-NEXT:    s_cmp_lg_u32 s6, 0
+; SI-NEXT:    s_cselect_b32 s1, 1, 0
+; SI-NEXT:    s_lshl_b32 s1, s1, 9
+; SI-NEXT:    s_or_b32 s1, s1, 0x7c00
+; SI-NEXT:    s_cmpk_eq_i32 s2, 0x40f
+; SI-NEXT:    s_cselect_b32 s0, s1, s0
+; SI-NEXT:    s_lshr_b32 s1, s7, 16
+; SI-NEXT:    s_and_b32 s1, s1, 0x8000
+; SI-NEXT:    s_or_b32 s6, s1, s0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -160,29 +155,26 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-SDAG-NEXT:    s_mov_b32 s2, -1
 ; VI-SAFE-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-SAFE-SDAG-NEXT:    s_mov_b32 s0, s4
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s4, s7, 8
-; VI-SAFE-SDAG-NEXT:    s_and_b32 s8, s4, 0xffe
-; VI-SAFE-SDAG-NEXT:    s_and_b32 s4, s7, 0x1ff
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s4, s6
-; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-SAFE-SDAG-NEXT:    s_mov_b32 s1, s5
-; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s4, s7, 8
+; VI-SAFE-SDAG-NEXT:    s_and_b32 s5, s7, 0x1ff
+; VI-SAFE-SDAG-NEXT:    s_and_b32 s4, s4, 0xffe
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s5, s6
+; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s5, 0
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s5, 1, 0
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s6, s4, s5
 ; VI-SAFE-SDAG-NEXT:    s_bfe_u32 s5, s7, 0xb0014
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s6, s8, s4
 ; VI-SAFE-SDAG-NEXT:    s_sub_i32 s8, 0x3f1, s5
 ; VI-SAFE-SDAG-NEXT:    v_med3_i32 v0, s8, 0, 13
 ; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s6, 0x1000
 ; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s8, v0
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s8, s4, s8
-; VI-SAFE-SDAG-NEXT:    v_lshlrev_b32_e64 v0, v0, s8
-; VI-SAFE-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v0
-; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s9, s4, s8
+; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s8, s9, s8
+; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s8, s4
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, 1, 0
 ; VI-SAFE-SDAG-NEXT:    s_add_i32 s10, s5, 0xfffffc10
-; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
 ; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s5, s10, 12
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s8, s4
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s9, s4
 ; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s6, s5
 ; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s10, 1
 ; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s11, s4, s5
@@ -196,19 +188,17 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; VI-SAFE-SDAG-NEXT:    s_addc_u32 s4, s8, 0
 ; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s10, 31
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s8, s4, 0x7c00
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, 0x7c00
 ; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, 0
-; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-SAFE-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s5, 1, 0
+; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s5, s5, 9
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s5, 0x7c00
 ; VI-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s10, 0x40f
-; VI-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
-; VI-SAFE-SDAG-NEXT:    v_mov_b32_e32 v1, s8
-; VI-SAFE-SDAG-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s4, s7, 16
-; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; VI-SAFE-SDAG-NEXT:    s_and_b32 s4, s4, 0x8000
-; VI-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, s4, v0
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s5, s4
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s7, 16
+; VI-SAFE-SDAG-NEXT:    s_and_b32 s5, s5, 0x8000
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s5, s4
+; VI-SAFE-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-SAFE-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-SAFE-SDAG-NEXT:    s_endpgm
 ;
@@ -295,49 +285,45 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s2, s4, s2
 ; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s4, s5, 0xffe
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-SAFE-SDAG-NEXT:    s_bfe_u32 s2, s3, 0xb0014
-; GFX10-SAFE-SDAG-NEXT:    s_sub_i32 s5, 0x3f1, s2
-; GFX10-SAFE-SDAG-NEXT:    s_addk_i32 s2, 0xfc10
-; GFX10-SAFE-SDAG-NEXT:    v_med3_i32 v1, s5, 0, 13
-; GFX10-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s7, s2, 12
-; GFX10-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s6, v1
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s4, s4, s5
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s5, s4, 0x1000
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s7, s4, s7
-; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s6, s5, s6
-; GFX10-SAFE-SDAG-NEXT:    v_lshlrev_b32_e64 v0, v1, s6
-; GFX10-SAFE-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s5, s6, s5
-; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 1
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s6, s5, 7
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-SAFE-SDAG-NEXT:    s_bfe_u32 s5, s3, 0xb0014
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s2, s4, s2
+; GFX10-SAFE-SDAG-NEXT:    s_sub_i32 s6, 0x3f1, s5
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s4, s2, 0x1000
+; GFX10-SAFE-SDAG-NEXT:    v_med3_i32 v0, s6, 0, 13
+; GFX10-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s7, s4, s6
+; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s7, s6
+; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, s4
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-SAFE-SDAG-NEXT:    s_addk_i32 s5, 0xfc10
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s4, s7, s4
+; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s5, 12
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s6, s2, s6
+; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s5, 1
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s6, s4, 7
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
+; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s4, s4, 2
 ; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-SAFE-SDAG-NEXT:    s_addc_u32 s5, s5, 0
-; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 31
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, 0x7c00
-; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX10-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s2, 0x40f
-; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX10-SAFE-SDAG-NEXT:    s_addc_u32 s4, s4, 0
+; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s5, 31
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, 0x7c00
+; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s2, s2, 9
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s2, s2, 0x7c00
+; GFX10-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s5, 0x40f
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s3, s3, 0x8000
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s2, s3, s2
 ; GFX10-SAFE-SDAG-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s2, s2, 0x8000
-; GFX10-SAFE-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
-; GFX10-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
-; GFX10-SAFE-SDAG-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
-; GFX10-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX10-SAFE-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-SAFE-SDAG-NEXT:    s_mov_b32 s2, -1
 ; GFX10-SAFE-SDAG-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX10-SAFE-SDAG-NEXT:    s_endpgm
@@ -425,58 +411,52 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s2, s4, s2
 ; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s4, s5, 0xffe
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-SAFE-SDAG-NEXT:    s_bfe_u32 s2, s3, 0xb0014
-; GFX11-SAFE-SDAG-NEXT:    s_sub_i32 s5, 0x3f1, s2
-; GFX11-SAFE-SDAG-NEXT:    s_addk_i32 s2, 0xfc10
-; GFX11-SAFE-SDAG-NEXT:    v_med3_i32 v1, s5, 0, 13
-; GFX11-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s7, s2, 12
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s6, v1
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s4, s4, s5
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s5, s4, 0x1000
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s7, s4, s7
-; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s6, s5, s6
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT:    v_lshlrev_b32_e64 v0, v1, s6
-; GFX11-SAFE-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-SAFE-SDAG-NEXT:    s_bfe_u32 s5, s3, 0xb0014
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s2, s4, s2
+; GFX11-SAFE-SDAG-NEXT:    s_sub_i32 s6, 0x3f1, s5
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s4, s2, 0x1000
+; GFX11-SAFE-SDAG-NEXT:    v_med3_i32 v0, s6, 0, 13
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s5, s6, s5
-; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 1
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s7
+; GFX11-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s7, s4, s6
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s6, s5, 7
+; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s7, s6
+; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, s4
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-SAFE-SDAG-NEXT:    s_addk_i32 s5, 0xfc10
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s4, s7, s4
+; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s5, 12
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s6, s2, s6
+; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s5, 1
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, s6
+; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s6, s4, 7
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
+; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s4, s4, 2
 ; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX11-SAFE-SDAG-NEXT:    s_addc_u32 s5, s5, 0
-; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 31
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, 0x7c00
-; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s2, 0x40f
-; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX11-SAFE-SDAG-NEXT:    s_addc_u32 s4, s4, 0
+; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s5, 31
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, 0x7c00
+; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s2, s2, 9
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s2, s2, 0x7c00
+; GFX11-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s5, 0x40f
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s2, s2, s4
+; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s3, s3, 0x8000
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-SAFE-SDAG-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s2, s2, 0x8000
-; GFX11-SAFE-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 9, v0
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
-; GFX11-SAFE-SDAG-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX11-SAFE-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-SAFE-SDAG-NEXT:    s_mov_b32 s2, -1
 ; GFX11-SAFE-SDAG-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-SAFE-SDAG-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 4f3086a9eb1f9a..0582ee20d11ba1 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -6,6 +6,34 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
 
 define double @v_sqrt_f64(double %x) {
+; SDAG-LABEL: v_sqrt_f64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -38,6 +66,34 @@ define double @v_sqrt_f64(double %x) {
 }
 
 define double @v_sqrt_f64_fneg(double %x) {
+; SDAG-LABEL: v_sqrt_f64_fneg:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 9
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_fneg:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -71,6 +127,34 @@ define double @v_sqrt_f64_fneg(double %x) {
 }
 
 define double @v_sqrt_f64_fabs(double %x) {
+; SDAG-LABEL: v_sqrt_f64_fabs:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_fabs:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -104,6 +188,34 @@ define double @v_sqrt_f64_fabs(double %x) {
 }
 
 define double @v_sqrt_f64_fneg_fabs(double %x) {
+; SDAG-LABEL: v_sqrt_f64_fneg_fabs:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 9
+; SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_fneg_fabs:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -138,6 +250,34 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 }
 
 define double @v_sqrt_f64_ninf(double %x) {
+; SDAG-LABEL: v_sqrt_f64_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -170,6 +310,34 @@ define double @v_sqrt_f64_ninf(double %x) {
 }
 
 define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" {
+; SDAG-LABEL: v_sqrt_f64_no_infs_attribute:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,6 +370,34 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 }
 
 define double @v_sqrt_f64_nnan(double %x) {
+; SDAG-LABEL: v_sqrt_f64_nnan:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_nnan:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -239,8 +435,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
@@ -308,8 +506,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
@@ -377,8 +577,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
@@ -446,8 +648,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
@@ -510,6 +714,34 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 }
 
 define double @v_sqrt_f64_nsz(double %x) {
+; SDAG-LABEL: v_sqrt_f64_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -542,6 +774,34 @@ define double @v_sqrt_f64_nsz(double %x) {
 }
 
 define double @v_sqrt_f64_nnan_ninf(double %x) {
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_nnan_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -574,6 +834,34 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 }
 
 define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -606,6 +894,34 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 }
 
 define double @v_sqrt_f64_afn(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_afn:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -638,6 +954,34 @@ define double @v_sqrt_f64_afn(double %x) {
 }
 
 define double @v_sqrt_f64_afn_nsz(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_afn_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -770,6 +1114,34 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 }
 
 define double @v_sqrt_f64_afn_nnan(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn_nnan:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -802,6 +1174,34 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 }
 
 define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
+; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -835,6 +1235,34 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 }
 
 define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -967,6 +1395,34 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 }
 
 define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -999,6 +1455,34 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 }
 
 define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
+; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1031,6 +1515,34 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 }
 
 define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
+; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1063,6 +1575,34 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 }
 
 define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
+; SDAG-LABEL: v_sqrt_f64__unsafe_attr:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-LABEL: v_sqrt_f64__unsafe_attr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
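
Note that the new SDAG blocks in fsqrt.f64.ll still use
v_cndmask_b32_e64 v2, 0, 1, vcc for the ldexp scale: the compare input lives
in VGPRs, so the i1 is divergent and the VALU form remains the right lowering.
Only the s_sqrt_f64* variants, whose input is an inreg scalar, switch to the
s_and_b64 / s_cselect_b32 / s_lshl_b32 sequence. A sketch of the divergent
side, which is intentionally left on the VALU (illustrative name):

define i32 @divergent_zext_i1(double %x) {
  %cmp = fcmp olt double %x, 0.000000e+00  ; divergent: %x arrives in VGPRs
  %ext = zext i1 %cmp to i32               ; still v_cndmask_b32 0, 1 on vcc
  ret i32 %ext
}
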
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index b89dbd42e0466f..3f0ee4968d8c76 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -115,8 +115,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
-; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V4-NEXT:    s_endpgm
@@ -127,8 +127,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
-; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX8V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
@@ -139,8 +139,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX9V4-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_eq_u32 s2, s1
-; GFX9V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V4-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V4-NEXT:    s_endpgm
@@ -151,8 +151,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX9V5-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_eq_u32 s2, s1
-; GFX9V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
@@ -169,8 +169,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
-; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V4-NEXT:    s_endpgm
@@ -181,8 +181,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
-; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX8V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
@@ -193,8 +193,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX9V4-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_eq_u32 s2, s1
-; GFX9V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V4-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V4-NEXT:    s_endpgm
@@ -205,8 +205,8 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX9V5-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_eq_u32 s2, s1
-; GFX9V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
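
In these kernels the saving is direct: s_cselect_b64 s[0:1], -1, 0 followed by
v_cndmask_b32_e64 v0, 0, 1, s[0:1] collapses to s_cselect_b32 s0, 1, 0 plus a
single v_mov_b32_e32, because the aperture compare is uniform. A reduced
sketch of the pattern (the separate output pointer is an illustrative
addition, not the layout of the actual test):

declare i1 @llvm.amdgcn.is.shared(ptr)

define amdgpu_kernel void @is_shared_zext(ptr %p, ptr addrspace(1) %out) {
  %is = call i1 @llvm.amdgcn.is.shared(ptr %p)  ; uniform for a kernel argument
  %ext = zext i1 %is to i32
  store i32 %ext, ptr addrspace(1) %out
  ret void
}
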
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 603f457f3e05e4..6f3d7331d1259c 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -882,39 +882,37 @@ define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %o
 ; GENERIC-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_add_i32 s2, s4, 0xfffffe00
-; GENERIC-NEXT:    s_cmp_eq_u32 s2, 1
-; GENERIC-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 2
-; GENERIC-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GENERIC-NEXT:    v_readfirstlane_b32 s4, v0
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 2
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 3
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 3
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 4
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 5
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 5
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 6
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 6
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 7
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 7
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 8
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 8
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 9
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 9
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 10
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 10
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 11
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 11
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 12
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 12
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 13
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 13
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 14
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 14
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 15
-; GENERIC-NEXT:    s_cmp_lg_u32 s2, 15
-; GENERIC-NEXT:    s_cselect_b32 s4, s4, 16
+; GENERIC-NEXT:    s_addk_i32 s4, 0xfe00
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
+; GENERIC-NEXT:    s_cselect_b32 s2, 1, 0
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 2
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 2
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 3
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 3
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 4
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 5
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 5
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 6
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 6
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 7
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 7
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 8
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 8
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 9
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 9
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 10
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 10
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 11
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 11
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 12
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 12
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 13
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 13
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 14
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 14
+; GENERIC-NEXT:    s_cselect_b32 s2, s2, 15
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 15
+; GENERIC-NEXT:    s_cselect_b32 s4, s2, 16
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    v_mov_b32_e32 v0, s4
 ; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -3316,60 +3314,60 @@ define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addr
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
-; GENERIC-NEXT:    s_add_i32 s6, s4, 0xfffffe00
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 0
-; GENERIC-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 3
-; GENERIC-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 3
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 2
-; GENERIC-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GENERIC-NEXT:    s_cselect_b32 s5, 16, 2
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 1
-; GENERIC-NEXT:    v_mov_b32_e32 v3, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 1
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 7
-; GENERIC-NEXT:    v_mov_b32_e32 v2, s5
+; GENERIC-NEXT:    s_addk_i32 s4, 0xfe00
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 1
+; GENERIC-NEXT:    s_cselect_b32 s5, 16, 1
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 2
+; GENERIC-NEXT:    s_cselect_b32 s6, 16, 2
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 3
+; GENERIC-NEXT:    v_mov_b32_e32 v1, s5
+; GENERIC-NEXT:    s_cselect_b32 s5, 16, 3
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 0
+; GENERIC-NEXT:    v_mov_b32_e32 v2, s6
+; GENERIC-NEXT:    s_cselect_b32 s6, 1, 0
+; GENERIC-NEXT:    v_mov_b32_e32 v3, s5
+; GENERIC-NEXT:    s_lshl_b32 s5, s6, 4
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 7
+; GENERIC-NEXT:    v_mov_b32_e32 v0, s5
 ; GENERIC-NEXT:    s_cselect_b32 s5, 16, 7
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 6
-; GENERIC-NEXT:    v_mov_b32_e32 v1, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 6
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 5
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 6
+; GENERIC-NEXT:    s_cselect_b32 s6, 16, 6
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 5
 ; GENERIC-NEXT:    v_mov_b32_e32 v7, s5
 ; GENERIC-NEXT:    s_cselect_b32 s5, 16, 5
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 4
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 4
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 11
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 4
+; GENERIC-NEXT:    v_mov_b32_e32 v6, s6
+; GENERIC-NEXT:    s_cselect_b32 s6, 16, 4
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 11
 ; GENERIC-NEXT:    v_mov_b32_e32 v5, s5
 ; GENERIC-NEXT:    s_cselect_b32 s5, 16, 11
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 10
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s4
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT:    v_mov_b32_e32 v4, s6
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 10
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 9
+; GENERIC-NEXT:    s_cselect_b32 s6, 16, 10
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 9
 ; GENERIC-NEXT:    s_waitcnt expcnt(0)
 ; GENERIC-NEXT:    v_mov_b32_e32 v7, s5
 ; GENERIC-NEXT:    s_cselect_b32 s5, 16, 9
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 8
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 8
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 15
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 8
+; GENERIC-NEXT:    v_mov_b32_e32 v6, s6
+; GENERIC-NEXT:    s_cselect_b32 s6, 16, 8
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 15
 ; GENERIC-NEXT:    v_mov_b32_e32 v5, s5
 ; GENERIC-NEXT:    s_cselect_b32 s5, 16, 15
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 14
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s4
-; GENERIC-NEXT:    s_cselect_b32 s4, 16, 14
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 13
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 14
+; GENERIC-NEXT:    v_mov_b32_e32 v4, s6
+; GENERIC-NEXT:    s_cselect_b32 s6, 16, 14
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 13
 ; GENERIC-NEXT:    s_cselect_b32 s7, 16, 13
-; GENERIC-NEXT:    s_cmp_eq_u32 s6, 12
-; GENERIC-NEXT:    s_cselect_b32 s6, 16, 12
+; GENERIC-NEXT:    s_cmp_eq_u32 s4, 12
+; GENERIC-NEXT:    s_cselect_b32 s4, 16, 12
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
 ; GENERIC-NEXT:    s_waitcnt expcnt(0)
 ; GENERIC-NEXT:    v_mov_b32_e32 v7, s5
-; GENERIC-NEXT:    v_mov_b32_e32 v6, s4
+; GENERIC-NEXT:    v_mov_b32_e32 v6, s6
 ; GENERIC-NEXT:    v_mov_b32_e32 v5, s7
-; GENERIC-NEXT:    v_mov_b32_e32 v4, s6
+; GENERIC-NEXT:    v_mov_b32_e32 v4, s4
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GENERIC-NEXT:    s_endpgm
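
The insert_neg_offset_sgpr checks also show a shift user folding through the
new form: v_cndmask_b32_e64 v0, 0, 1, s[4:5] / v_lshlrev_b32_e32 v0, 4, v0
becomes s_cselect_b32 s6, 1, 0 / s_lshl_b32 s5, s6, 4, with one
v_mov_b32_e32 to feed the store. Reduced sketch (illustrative names):

define amdgpu_kernel void @uniform_zext_shl(ptr addrspace(1) %out, i32 %idx) {
  %cmp = icmp eq i32 %idx, 0   ; uniform compare of a kernel argument
  %ext = zext i1 %cmp to i32
  %shl = shl i32 %ext, 4       ; 16 or 0, computed entirely on the SALU
  store i32 %shl, ptr addrspace(1) %out
  ret void
}
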
@@ -7230,7 +7228,7 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; GENERIC-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GENERIC-NEXT:    s_cmp_lg_u32 s0, 0
-; GENERIC-NEXT:    s_cbranch_scc0 .LBB19_4
+; GENERIC-NEXT:    s_cbranch_scc0 .LBB19_2
 ; GENERIC-NEXT:  ; %bb.1: ; %bb4
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
@@ -7239,9 +7237,16 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; GENERIC-NEXT:    ;;#ASMSTART
 ; GENERIC-NEXT:    ; reg use v[0:3]
 ; GENERIC-NEXT:    ;;#ASMEND
-; GENERIC-NEXT:    s_mov_b64 vcc, exec
-; GENERIC-NEXT:    s_cbranch_execnz .LBB19_3
-; GENERIC-NEXT:  .LBB19_2: ; %bb1
+; GENERIC-NEXT:    s_mov_b64 s[0:1], 0
+; GENERIC-NEXT:    s_branch .LBB19_3
+; GENERIC-NEXT:  .LBB19_2:
+; GENERIC-NEXT:    s_mov_b64 s[0:1], -1
+; GENERIC-NEXT:  .LBB19_3: ; %Flow
+; GENERIC-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GENERIC-NEXT:    s_cselect_b32 s0, 1, 0
+; GENERIC-NEXT:    s_cmp_lg_u32 s0, 1
+; GENERIC-NEXT:    s_cbranch_scc1 .LBB19_5
+; GENERIC-NEXT:  ; %bb.4: ; %bb1
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
@@ -7249,15 +7254,12 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; GENERIC-NEXT:    ;;#ASMSTART
 ; GENERIC-NEXT:    ; reg use v[0:3]
 ; GENERIC-NEXT:    ;;#ASMEND
-; GENERIC-NEXT:  .LBB19_3: ; %bb7
+; GENERIC-NEXT:  .LBB19_5: ; %bb7
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
 ; GENERIC-NEXT:    s_endpgm
-; GENERIC-NEXT:  .LBB19_4:
-; GENERIC-NEXT:    s_mov_b64 vcc, 0
-; GENERIC-NEXT:    s_branch .LBB19_2
 ;
 ; NOOPT-LABEL: extract_adjacent_blocks:
 ; NOOPT:       ; %bb.0: ; %bb
@@ -7290,12 +7292,11 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v4, 0
 ; NOOPT-NEXT:    v_readlane_b32 s1, v4, 1
-; NOOPT-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; NOOPT-NEXT:    s_mov_b32 s0, 1
-; NOOPT-NEXT:    ; implicit-def: $sgpr1
-; NOOPT-NEXT:    v_cmp_ne_u32_e64 s[0:1], v0, s0
-; NOOPT-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; NOOPT-NEXT:    s_cbranch_vccnz .LBB19_4
+; NOOPT-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; NOOPT-NEXT:    s_cselect_b32 s0, 1, 0
+; NOOPT-NEXT:    s_mov_b32 s1, 1
+; NOOPT-NEXT:    s_cmp_lg_u32 s0, s1
+; NOOPT-NEXT:    s_cbranch_scc1 .LBB19_4
 ; NOOPT-NEXT:  ; %bb.2: ; %bb1
 ; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; NOOPT-NEXT:    s_mov_b32 s6, s1
@@ -7366,7 +7367,7 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; SI-MOVREL-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-MOVREL-NEXT:    s_cmp_lg_u32 s0, 0
-; SI-MOVREL-NEXT:    s_cbranch_scc0 .LBB19_4
+; SI-MOVREL-NEXT:    s_cbranch_scc0 .LBB19_2
 ; SI-MOVREL-NEXT:  ; %bb.1: ; %bb4
 ; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
@@ -7375,8 +7376,16 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; SI-MOVREL-NEXT:    ;;#ASMSTART
 ; SI-MOVREL-NEXT:    ; reg use v[0:3]
 ; SI-MOVREL-NEXT:    ;;#ASMEND
-; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB19_3
-; SI-MOVREL-NEXT:  .LBB19_2: ; %bb1
+; SI-MOVREL-NEXT:    s_mov_b64 s[0:1], 0
+; SI-MOVREL-NEXT:    s_branch .LBB19_3
+; SI-MOVREL-NEXT:  .LBB19_2:
+; SI-MOVREL-NEXT:    s_mov_b64 s[0:1], -1
+; SI-MOVREL-NEXT:  .LBB19_3: ; %Flow
+; SI-MOVREL-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-MOVREL-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-MOVREL-NEXT:    s_cmp_lg_u32 s0, 1
+; SI-MOVREL-NEXT:    s_cbranch_scc1 .LBB19_5
+; SI-MOVREL-NEXT:  ; %bb.4: ; %bb1
 ; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
 ; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
@@ -7384,66 +7393,76 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
 ; SI-MOVREL-NEXT:    ;;#ASMSTART
 ; SI-MOVREL-NEXT:    ; reg use v[0:3]
 ; SI-MOVREL-NEXT:    ;;#ASMEND
-; SI-MOVREL-NEXT:  .LBB19_3: ; %bb7
+; SI-MOVREL-NEXT:  .LBB19_5: ; %bb7
 ; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
 ; SI-MOVREL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-MOVREL-NEXT:    s_endpgm
-; SI-MOVREL-NEXT:  .LBB19_4:
-; SI-MOVREL-NEXT:    s_branch .LBB19_2
 ;
 ; VI-LABEL: extract_adjacent_blocks:
 ; VI:       ; %bb.0: ; %bb
 ; VI-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_lg_u32 s0, 0
-; VI-NEXT:    s_cbranch_scc0 .LBB19_4
+; VI-NEXT:    s_cbranch_scc0 .LBB19_2
 ; VI-NEXT:  ; %bb.1: ; %bb4
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    ;;#ASMSTART
 ; VI-NEXT:    ; reg use v[0:3]
 ; VI-NEXT:    ;;#ASMEND
-; VI-NEXT:    s_cbranch_execnz .LBB19_3
-; VI-NEXT:  .LBB19_2: ; %bb1
+; VI-NEXT:    s_mov_b64 s[0:1], 0
+; VI-NEXT:    s_branch .LBB19_3
+; VI-NEXT:  .LBB19_2:
+; VI-NEXT:    s_mov_b64 s[0:1], -1
+; VI-NEXT:  .LBB19_3: ; %Flow
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s0, 1
+; VI-NEXT:    s_cbranch_scc1 .LBB19_5
+; VI-NEXT:  ; %bb.4: ; %bb1
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    ;;#ASMSTART
 ; VI-NEXT:    ; reg use v[0:3]
 ; VI-NEXT:    ;;#ASMEND
-; VI-NEXT:  .LBB19_3: ; %bb7
+; VI-NEXT:  .LBB19_5: ; %bb7
 ; VI-NEXT:    flat_store_dword v[0:1], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
-; VI-NEXT:  .LBB19_4:
-; VI-NEXT:    s_branch .LBB19_2
 ;
 ; GFX9-IDXMODE-LABEL: extract_adjacent_blocks:
 ; GFX9-IDXMODE:       ; %bb.0: ; %bb
 ; GFX9-IDXMODE-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-IDXMODE-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-IDXMODE-NEXT:    s_cbranch_scc0 .LBB19_4
+; GFX9-IDXMODE-NEXT:    s_cbranch_scc0 .LBB19_2
 ; GFX9-IDXMODE-NEXT:  ; %bb.1: ; %bb4
 ; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off glc
 ; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
 ; GFX9-IDXMODE-NEXT:    ; reg use v[0:3]
 ; GFX9-IDXMODE-NEXT:    ;;#ASMEND
-; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB19_3
-; GFX9-IDXMODE-NEXT:  .LBB19_2: ; %bb1
+; GFX9-IDXMODE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-IDXMODE-NEXT:    s_branch .LBB19_3
+; GFX9-IDXMODE-NEXT:  .LBB19_2:
+; GFX9-IDXMODE-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-IDXMODE-NEXT:  .LBB19_3: ; %Flow
+; GFX9-IDXMODE-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-IDXMODE-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-IDXMODE-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX9-IDXMODE-NEXT:    s_cbranch_scc1 .LBB19_5
+; GFX9-IDXMODE-NEXT:  ; %bb.4: ; %bb1
 ; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off glc
 ; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
 ; GFX9-IDXMODE-NEXT:    ; reg use v[0:3]
 ; GFX9-IDXMODE-NEXT:    ;;#ASMEND
-; GFX9-IDXMODE-NEXT:  .LBB19_3: ; %bb7
+; GFX9-IDXMODE-NEXT:  .LBB19_5: ; %bb7
 ; GFX9-IDXMODE-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-IDXMODE-NEXT:    s_endpgm
-; GFX9-IDXMODE-NEXT:  .LBB19_4:
-; GFX9-IDXMODE-NEXT:    s_branch .LBB19_2
 bb:
   %tmp = icmp eq i32 %arg, 0
   br i1 %tmp, label %bb1, label %bb4
@@ -7472,7 +7491,7 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; GENERIC-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
 ; GENERIC-NEXT:    s_cmp_lg_u32 s0, 0
-; GENERIC-NEXT:    s_cbranch_scc0 .LBB20_4
+; GENERIC-NEXT:    s_cbranch_scc0 .LBB20_2
 ; GENERIC-NEXT:  ; %bb.1: ; %bb4
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
@@ -7481,9 +7500,16 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; GENERIC-NEXT:    ;;#ASMSTART
 ; GENERIC-NEXT:    ; reg use v[0:3]
 ; GENERIC-NEXT:    ;;#ASMEND
-; GENERIC-NEXT:    s_mov_b64 vcc, exec
-; GENERIC-NEXT:    s_cbranch_execnz .LBB20_3
-; GENERIC-NEXT:  .LBB20_2: ; %bb1
+; GENERIC-NEXT:    s_mov_b64 s[0:1], 0
+; GENERIC-NEXT:    s_branch .LBB20_3
+; GENERIC-NEXT:  .LBB20_2:
+; GENERIC-NEXT:    s_mov_b64 s[0:1], -1
+; GENERIC-NEXT:  .LBB20_3: ; %Flow
+; GENERIC-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GENERIC-NEXT:    s_cselect_b32 s0, 1, 0
+; GENERIC-NEXT:    s_cmp_lg_u32 s0, 1
+; GENERIC-NEXT:    s_cbranch_scc1 .LBB20_5
+; GENERIC-NEXT:  ; %bb.4: ; %bb1
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
@@ -7491,15 +7517,12 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; GENERIC-NEXT:    ;;#ASMSTART
 ; GENERIC-NEXT:    ; reg use v[0:3]
 ; GENERIC-NEXT:    ;;#ASMEND
-; GENERIC-NEXT:  .LBB20_3: ; %bb7
+; GENERIC-NEXT:  .LBB20_5: ; %bb7
 ; GENERIC-NEXT:    s_mov_b32 s3, 0xf000
 ; GENERIC-NEXT:    s_mov_b32 s2, -1
 ; GENERIC-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GENERIC-NEXT:    s_waitcnt vmcnt(0)
 ; GENERIC-NEXT:    s_endpgm
-; GENERIC-NEXT:  .LBB20_4:
-; GENERIC-NEXT:    s_mov_b64 vcc, 0
-; GENERIC-NEXT:    s_branch .LBB20_2
 ;
 ; NOOPT-LABEL: insert_adjacent_blocks:
 ; NOOPT:       ; %bb.0: ; %bb
@@ -7534,12 +7557,11 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    v_readlane_b32 s0, v4, 0
 ; NOOPT-NEXT:    v_readlane_b32 s1, v4, 1
-; NOOPT-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; NOOPT-NEXT:    s_mov_b32 s0, 1
-; NOOPT-NEXT:    ; implicit-def: $sgpr1
-; NOOPT-NEXT:    v_cmp_ne_u32_e64 s[0:1], v0, s0
-; NOOPT-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; NOOPT-NEXT:    s_cbranch_vccnz .LBB20_4
+; NOOPT-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; NOOPT-NEXT:    s_cselect_b32 s0, 1, 0
+; NOOPT-NEXT:    s_mov_b32 s1, 1
+; NOOPT-NEXT:    s_cmp_lg_u32 s0, s1
+; NOOPT-NEXT:    s_cbranch_scc1 .LBB20_4
 ; NOOPT-NEXT:  ; %bb.2: ; %bb1
 ; NOOPT-NEXT:    ; implicit-def: $sgpr0_sgpr1
 ; NOOPT-NEXT:    s_mov_b32 s6, s1
@@ -7615,7 +7637,7 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; SI-MOVREL-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-MOVREL-NEXT:    s_cmp_lg_u32 s0, 0
-; SI-MOVREL-NEXT:    s_cbranch_scc0 .LBB20_4
+; SI-MOVREL-NEXT:    s_cbranch_scc0 .LBB20_2
 ; SI-MOVREL-NEXT:  ; %bb.1: ; %bb4
 ; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
@@ -7624,8 +7646,16 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; SI-MOVREL-NEXT:    ;;#ASMSTART
 ; SI-MOVREL-NEXT:    ; reg use v[0:3]
 ; SI-MOVREL-NEXT:    ;;#ASMEND
-; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB20_3
-; SI-MOVREL-NEXT:  .LBB20_2: ; %bb1
+; SI-MOVREL-NEXT:    s_mov_b64 s[0:1], 0
+; SI-MOVREL-NEXT:    s_branch .LBB20_3
+; SI-MOVREL-NEXT:  .LBB20_2:
+; SI-MOVREL-NEXT:    s_mov_b64 s[0:1], -1
+; SI-MOVREL-NEXT:  .LBB20_3: ; %Flow
+; SI-MOVREL-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-MOVREL-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-MOVREL-NEXT:    s_cmp_lg_u32 s0, 1
+; SI-MOVREL-NEXT:    s_cbranch_scc1 .LBB20_5
+; SI-MOVREL-NEXT:  ; %bb.4: ; %bb1
 ; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
 ; SI-MOVREL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
@@ -7633,66 +7663,76 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
 ; SI-MOVREL-NEXT:    ;;#ASMSTART
 ; SI-MOVREL-NEXT:    ; reg use v[0:3]
 ; SI-MOVREL-NEXT:    ;;#ASMEND
-; SI-MOVREL-NEXT:  .LBB20_3: ; %bb7
+; SI-MOVREL-NEXT:  .LBB20_5: ; %bb7
 ; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
 ; SI-MOVREL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-MOVREL-NEXT:    s_endpgm
-; SI-MOVREL-NEXT:  .LBB20_4:
-; SI-MOVREL-NEXT:    s_branch .LBB20_2
 ;
 ; VI-LABEL: insert_adjacent_blocks:
 ; VI:       ; %bb.0: ; %bb
 ; VI-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_lg_u32 s0, 0
-; VI-NEXT:    s_cbranch_scc0 .LBB20_4
+; VI-NEXT:    s_cbranch_scc0 .LBB20_2
 ; VI-NEXT:  ; %bb.1: ; %bb4
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    ;;#ASMSTART
 ; VI-NEXT:    ; reg use v[0:3]
 ; VI-NEXT:    ;;#ASMEND
-; VI-NEXT:    s_cbranch_execnz .LBB20_3
-; VI-NEXT:  .LBB20_2: ; %bb1
+; VI-NEXT:    s_mov_b64 s[0:1], 0
+; VI-NEXT:    s_branch .LBB20_3
+; VI-NEXT:  .LBB20_2:
+; VI-NEXT:    s_mov_b64 s[0:1], -1
+; VI-NEXT:  .LBB20_3: ; %Flow
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s0, 1
+; VI-NEXT:    s_cbranch_scc1 .LBB20_5
+; VI-NEXT:  ; %bb.4: ; %bb1
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    ;;#ASMSTART
 ; VI-NEXT:    ; reg use v[0:3]
 ; VI-NEXT:    ;;#ASMEND
-; VI-NEXT:  .LBB20_3: ; %bb7
+; VI-NEXT:  .LBB20_5: ; %bb7
 ; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
-; VI-NEXT:  .LBB20_4:
-; VI-NEXT:    s_branch .LBB20_2
 ;
 ; GFX9-IDXMODE-LABEL: insert_adjacent_blocks:
 ; GFX9-IDXMODE:       ; %bb.0: ; %bb
 ; GFX9-IDXMODE-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-IDXMODE-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-IDXMODE-NEXT:    s_cbranch_scc0 .LBB20_4
+; GFX9-IDXMODE-NEXT:    s_cbranch_scc0 .LBB20_2
 ; GFX9-IDXMODE-NEXT:  ; %bb.1: ; %bb4
 ; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off glc
 ; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
 ; GFX9-IDXMODE-NEXT:    ; reg use v[0:3]
 ; GFX9-IDXMODE-NEXT:    ;;#ASMEND
-; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB20_3
-; GFX9-IDXMODE-NEXT:  .LBB20_2: ; %bb1
+; GFX9-IDXMODE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-IDXMODE-NEXT:    s_branch .LBB20_3
+; GFX9-IDXMODE-NEXT:  .LBB20_2:
+; GFX9-IDXMODE-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-IDXMODE-NEXT:  .LBB20_3: ; %Flow
+; GFX9-IDXMODE-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-IDXMODE-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-IDXMODE-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX9-IDXMODE-NEXT:    s_cbranch_scc1 .LBB20_5
+; GFX9-IDXMODE-NEXT:  ; %bb.4: ; %bb1
 ; GFX9-IDXMODE-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off glc
 ; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-IDXMODE-NEXT:    ;;#ASMSTART
 ; GFX9-IDXMODE-NEXT:    ; reg use v[0:3]
 ; GFX9-IDXMODE-NEXT:    ;;#ASMEND
-; GFX9-IDXMODE-NEXT:  .LBB20_3: ; %bb7
+; GFX9-IDXMODE-NEXT:  .LBB20_5: ; %bb7
 ; GFX9-IDXMODE-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-IDXMODE-NEXT:    s_endpgm
-; GFX9-IDXMODE-NEXT:  .LBB20_4:
-; GFX9-IDXMODE-NEXT:    s_branch .LBB20_2
 bb:
   %tmp = icmp eq i32 %arg, 0
   br i1 %tmp, label %bb1, label %bb4
@@ -9037,8 +9077,10 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; GENERIC-NEXT:    s_branch .LBB26_2
 ; GENERIC-NEXT:  .LBB26_1: ; %Flow
 ; GENERIC-NEXT:    ; in Loop: Header=BB26_2 Depth=1
-; GENERIC-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GENERIC-NEXT:    s_cbranch_vccz .LBB26_4
+; GENERIC-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GENERIC-NEXT:    s_cselect_b32 s4, 1, 0
+; GENERIC-NEXT:    s_cmp_lg_u32 s4, 1
+; GENERIC-NEXT:    s_cbranch_scc0 .LBB26_4
 ; GENERIC-NEXT:  .LBB26_2: ; %bb2
 ; GENERIC-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GENERIC-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9291,14 +9333,13 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; NOOPT-NEXT:    v_readlane_b32 s0, v18, 2
 ; NOOPT-NEXT:    v_readlane_b32 s1, v18, 3
 ; NOOPT-NEXT:    buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload
-; NOOPT-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; NOOPT-NEXT:    s_mov_b32 s0, 1
-; NOOPT-NEXT:    ; implicit-def: $sgpr1
-; NOOPT-NEXT:    v_cmp_ne_u32_e64 s[0:1], v1, s0
-; NOOPT-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; NOOPT-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; NOOPT-NEXT:    s_cselect_b32 s0, 1, 0
+; NOOPT-NEXT:    s_mov_b32 s1, 1
+; NOOPT-NEXT:    s_cmp_lg_u32 s0, s1
 ; NOOPT-NEXT:    s_waitcnt vmcnt(0)
 ; NOOPT-NEXT:    buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
-; NOOPT-NEXT:    s_cbranch_vccnz .LBB26_1
+; NOOPT-NEXT:    s_cbranch_scc1 .LBB26_1
 ; NOOPT-NEXT:  ; %bb.7: ; %bb8
 ; NOOPT-NEXT:    s_endpgm
 ;
@@ -9308,24 +9349,30 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v0, 8
 ; SI-MOVREL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-MOVREL-NEXT:    s_mov_b32 s2, -1
-; SI-MOVREL-NEXT:    s_branch .LBB26_2
-; SI-MOVREL-NEXT:  .LBB26_1:
+; SI-MOVREL-NEXT:    s_branch .LBB26_3
+; SI-MOVREL-NEXT:  .LBB26_1: ; in Loop: Header=BB26_3 Depth=1
+; SI-MOVREL-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-MOVREL-NEXT:    ; implicit-def: $vgpr0
-; SI-MOVREL-NEXT:    s_branch .LBB26_6
-; SI-MOVREL-NEXT:  .LBB26_2: ; %bb2
+; SI-MOVREL-NEXT:  .LBB26_2: ; %Flow
+; SI-MOVREL-NEXT:    ; in Loop: Header=BB26_3 Depth=1
+; SI-MOVREL-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-MOVREL-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-MOVREL-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-MOVREL-NEXT:    s_cbranch_scc0 .LBB26_7
+; SI-MOVREL-NEXT:  .LBB26_3: ; %bb2
 ; SI-MOVREL-NEXT:    ; =>This Loop Header: Depth=1
-; SI-MOVREL-NEXT:    ; Child Loop BB26_4 Depth 2
+; SI-MOVREL-NEXT:    ; Child Loop BB26_5 Depth 2
 ; SI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-MOVREL-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
 ; SI-MOVREL-NEXT:    s_cbranch_vccnz .LBB26_1
-; SI-MOVREL-NEXT:  ; %bb.3: ; %bb4
-; SI-MOVREL-NEXT:    ; in Loop: Header=BB26_2 Depth=1
+; SI-MOVREL-NEXT:  ; %bb.4: ; %bb4
+; SI-MOVREL-NEXT:    ; in Loop: Header=BB26_3 Depth=1
 ; SI-MOVREL-NEXT:    buffer_load_dword v16, off, s[0:3], 0 glc
 ; SI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-MOVREL-NEXT:    v_mov_b32_e32 v17, s1
 ; SI-MOVREL-NEXT:    s_mov_b64 s[4:5], exec
 ; SI-MOVREL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; SI-MOVREL-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
+; SI-MOVREL-NEXT:  .LBB26_5: ; Parent Loop BB26_3 Depth=1
 ; SI-MOVREL-NEXT:    ; => This Inner Loop Header: Depth=2
 ; SI-MOVREL-NEXT:    v_readfirstlane_b32 s6, v16
 ; SI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v16
@@ -9333,35 +9380,42 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; SI-MOVREL-NEXT:    s_mov_b32 m0, s6
 ; SI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v17
 ; SI-MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_4
-; SI-MOVREL-NEXT:  ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
+; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_5
+; SI-MOVREL-NEXT:  ; %bb.6: ; in Loop: Header=BB26_3 Depth=1
 ; SI-MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; SI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_2
-; SI-MOVREL-NEXT:  .LBB26_6: ; %bb8
+; SI-MOVREL-NEXT:    s_mov_b64 s[4:5], 0
+; SI-MOVREL-NEXT:    s_branch .LBB26_2
+; SI-MOVREL-NEXT:  .LBB26_7: ; %bb8
 ; SI-MOVREL-NEXT:    s_endpgm
 ;
 ; VI-MOVREL-LABEL: broken_phi_bb:
 ; VI-MOVREL:       ; %bb.0: ; %bb
 ; VI-MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v0, 8
-; VI-MOVREL-NEXT:    s_branch .LBB26_2
-; VI-MOVREL-NEXT:  .LBB26_1:
+; VI-MOVREL-NEXT:    s_branch .LBB26_3
+; VI-MOVREL-NEXT:  .LBB26_1: ; in Loop: Header=BB26_3 Depth=1
+; VI-MOVREL-NEXT:    s_mov_b64 s[2:3], -1
 ; VI-MOVREL-NEXT:    ; implicit-def: $vgpr0
-; VI-MOVREL-NEXT:    s_branch .LBB26_6
-; VI-MOVREL-NEXT:  .LBB26_2: ; %bb2
+; VI-MOVREL-NEXT:  .LBB26_2: ; %Flow
+; VI-MOVREL-NEXT:    ; in Loop: Header=BB26_3 Depth=1
+; VI-MOVREL-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; VI-MOVREL-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-MOVREL-NEXT:    s_cmp_lg_u32 s2, 1
+; VI-MOVREL-NEXT:    s_cbranch_scc0 .LBB26_7
+; VI-MOVREL-NEXT:  .LBB26_3: ; %bb2
 ; VI-MOVREL-NEXT:    ; =>This Loop Header: Depth=1
-; VI-MOVREL-NEXT:    ; Child Loop BB26_4 Depth 2
+; VI-MOVREL-NEXT:    ; Child Loop BB26_5 Depth 2
 ; VI-MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-MOVREL-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
 ; VI-MOVREL-NEXT:    s_cbranch_vccnz .LBB26_1
-; VI-MOVREL-NEXT:  ; %bb.3: ; %bb4
-; VI-MOVREL-NEXT:    ; in Loop: Header=BB26_2 Depth=1
+; VI-MOVREL-NEXT:  ; %bb.4: ; %bb4
+; VI-MOVREL-NEXT:    ; in Loop: Header=BB26_3 Depth=1
 ; VI-MOVREL-NEXT:    flat_load_dword v16, v[0:1] glc
 ; VI-MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-MOVREL-NEXT:    v_mov_b32_e32 v17, s1
 ; VI-MOVREL-NEXT:    s_mov_b64 s[2:3], exec
 ; VI-MOVREL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; VI-MOVREL-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
+; VI-MOVREL-NEXT:  .LBB26_5: ; Parent Loop BB26_3 Depth=1
 ; VI-MOVREL-NEXT:    ; => This Inner Loop Header: Depth=2
 ; VI-MOVREL-NEXT:    v_readfirstlane_b32 s4, v16
 ; VI-MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v16
@@ -9369,35 +9423,42 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; VI-MOVREL-NEXT:    s_mov_b32 m0, s4
 ; VI-MOVREL-NEXT:    v_movreld_b32_e32 v0, v17
 ; VI-MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; VI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_4
-; VI-MOVREL-NEXT:  ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
+; VI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_5
+; VI-MOVREL-NEXT:  ; %bb.6: ; in Loop: Header=BB26_3 Depth=1
 ; VI-MOVREL-NEXT:    s_mov_b64 exec, s[2:3]
-; VI-MOVREL-NEXT:    s_cbranch_execnz .LBB26_2
-; VI-MOVREL-NEXT:  .LBB26_6: ; %bb8
+; VI-MOVREL-NEXT:    s_mov_b64 s[2:3], 0
+; VI-MOVREL-NEXT:    s_branch .LBB26_2
+; VI-MOVREL-NEXT:  .LBB26_7: ; %bb8
 ; VI-MOVREL-NEXT:    s_endpgm
 ;
 ; VI-IDXMODE-LABEL: broken_phi_bb:
 ; VI-IDXMODE:       ; %bb.0: ; %bb
 ; VI-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, 8
-; VI-IDXMODE-NEXT:    s_branch .LBB26_2
-; VI-IDXMODE-NEXT:  .LBB26_1:
+; VI-IDXMODE-NEXT:    s_branch .LBB26_3
+; VI-IDXMODE-NEXT:  .LBB26_1: ; in Loop: Header=BB26_3 Depth=1
+; VI-IDXMODE-NEXT:    s_mov_b64 s[2:3], -1
 ; VI-IDXMODE-NEXT:    ; implicit-def: $vgpr0
-; VI-IDXMODE-NEXT:    s_branch .LBB26_6
-; VI-IDXMODE-NEXT:  .LBB26_2: ; %bb2
+; VI-IDXMODE-NEXT:  .LBB26_2: ; %Flow
+; VI-IDXMODE-NEXT:    ; in Loop: Header=BB26_3 Depth=1
+; VI-IDXMODE-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; VI-IDXMODE-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-IDXMODE-NEXT:    s_cmp_lg_u32 s2, 1
+; VI-IDXMODE-NEXT:    s_cbranch_scc0 .LBB26_7
+; VI-IDXMODE-NEXT:  .LBB26_3: ; %bb2
 ; VI-IDXMODE-NEXT:    ; =>This Loop Header: Depth=1
-; VI-IDXMODE-NEXT:    ; Child Loop BB26_4 Depth 2
+; VI-IDXMODE-NEXT:    ; Child Loop BB26_5 Depth 2
 ; VI-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-IDXMODE-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
 ; VI-IDXMODE-NEXT:    s_cbranch_vccnz .LBB26_1
-; VI-IDXMODE-NEXT:  ; %bb.3: ; %bb4
-; VI-IDXMODE-NEXT:    ; in Loop: Header=BB26_2 Depth=1
+; VI-IDXMODE-NEXT:  ; %bb.4: ; %bb4
+; VI-IDXMODE-NEXT:    ; in Loop: Header=BB26_3 Depth=1
 ; VI-IDXMODE-NEXT:    flat_load_dword v16, v[0:1] glc
 ; VI-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v17, s1
 ; VI-IDXMODE-NEXT:    s_mov_b64 s[2:3], exec
 ; VI-IDXMODE-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; VI-IDXMODE-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
+; VI-IDXMODE-NEXT:  .LBB26_5: ; Parent Loop BB26_3 Depth=1
 ; VI-IDXMODE-NEXT:    ; => This Inner Loop Header: Depth=2
 ; VI-IDXMODE-NEXT:    v_readfirstlane_b32 s4, v16
 ; VI-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v16
@@ -9406,35 +9467,42 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; VI-IDXMODE-NEXT:    v_mov_b32_e32 v0, v17
 ; VI-IDXMODE-NEXT:    s_set_gpr_idx_off
 ; VI-IDXMODE-NEXT:    s_xor_b64 exec, exec, vcc
-; VI-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_4
-; VI-IDXMODE-NEXT:  ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
+; VI-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_5
+; VI-IDXMODE-NEXT:  ; %bb.6: ; in Loop: Header=BB26_3 Depth=1
 ; VI-IDXMODE-NEXT:    s_mov_b64 exec, s[2:3]
-; VI-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_2
-; VI-IDXMODE-NEXT:  .LBB26_6: ; %bb8
+; VI-IDXMODE-NEXT:    s_mov_b64 s[2:3], 0
+; VI-IDXMODE-NEXT:    s_branch .LBB26_2
+; VI-IDXMODE-NEXT:  .LBB26_7: ; %bb8
 ; VI-IDXMODE-NEXT:    s_endpgm
 ;
 ; GFX9-IDXMODE-LABEL: broken_phi_bb:
 ; GFX9-IDXMODE:       ; %bb.0: ; %bb
 ; GFX9-IDXMODE-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, 8
-; GFX9-IDXMODE-NEXT:    s_branch .LBB26_2
-; GFX9-IDXMODE-NEXT:  .LBB26_1:
+; GFX9-IDXMODE-NEXT:    s_branch .LBB26_3
+; GFX9-IDXMODE-NEXT:  .LBB26_1: ; in Loop: Header=BB26_3 Depth=1
+; GFX9-IDXMODE-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX9-IDXMODE-NEXT:    ; implicit-def: $vgpr0
-; GFX9-IDXMODE-NEXT:    s_branch .LBB26_6
-; GFX9-IDXMODE-NEXT:  .LBB26_2: ; %bb2
+; GFX9-IDXMODE-NEXT:  .LBB26_2: ; %Flow
+; GFX9-IDXMODE-NEXT:    ; in Loop: Header=BB26_3 Depth=1
+; GFX9-IDXMODE-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-IDXMODE-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-IDXMODE-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX9-IDXMODE-NEXT:    s_cbranch_scc0 .LBB26_7
+; GFX9-IDXMODE-NEXT:  .LBB26_3: ; %bb2
 ; GFX9-IDXMODE-NEXT:    ; =>This Loop Header: Depth=1
-; GFX9-IDXMODE-NEXT:    ; Child Loop BB26_4 Depth 2
+; GFX9-IDXMODE-NEXT:    ; Child Loop BB26_5 Depth 2
 ; GFX9-IDXMODE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-IDXMODE-NEXT:    v_cmp_le_i32_e32 vcc, s0, v0
 ; GFX9-IDXMODE-NEXT:    s_cbranch_vccnz .LBB26_1
-; GFX9-IDXMODE-NEXT:  ; %bb.3: ; %bb4
-; GFX9-IDXMODE-NEXT:    ; in Loop: Header=BB26_2 Depth=1
+; GFX9-IDXMODE-NEXT:  ; %bb.4: ; %bb4
+; GFX9-IDXMODE-NEXT:    ; in Loop: Header=BB26_3 Depth=1
 ; GFX9-IDXMODE-NEXT:    global_load_dword v16, v[0:1], off glc
 ; GFX9-IDXMODE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v17, s1
 ; GFX9-IDXMODE-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-IDXMODE-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX9-IDXMODE-NEXT:  .LBB26_4: ; Parent Loop BB26_2 Depth=1
+; GFX9-IDXMODE-NEXT:  .LBB26_5: ; Parent Loop BB26_3 Depth=1
 ; GFX9-IDXMODE-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX9-IDXMODE-NEXT:    v_readfirstlane_b32 s4, v16
 ; GFX9-IDXMODE-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v16
@@ -9443,11 +9511,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
 ; GFX9-IDXMODE-NEXT:    v_mov_b32_e32 v0, v17
 ; GFX9-IDXMODE-NEXT:    s_set_gpr_idx_off
 ; GFX9-IDXMODE-NEXT:    s_xor_b64 exec, exec, vcc
-; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_4
-; GFX9-IDXMODE-NEXT:  ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
+; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_5
+; GFX9-IDXMODE-NEXT:  ; %bb.6: ; in Loop: Header=BB26_3 Depth=1
 ; GFX9-IDXMODE-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-IDXMODE-NEXT:    s_cbranch_execnz .LBB26_2
-; GFX9-IDXMODE-NEXT:  .LBB26_6: ; %bb8
+; GFX9-IDXMODE-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-IDXMODE-NEXT:    s_branch .LBB26_2
+; GFX9-IDXMODE-NEXT:  .LBB26_7: ; %bb8
 ; GFX9-IDXMODE-NEXT:    s_endpgm
 bb:
   br label %bb2
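
The rewrite repeated throughout these hunks has one shape: instead of copying the i1 wave mask into a VGPR with v_cndmask_b32 and branching through vcc, the condition is reduced to a scalar 0/1 and tested with a scalar compare. A minimal sketch of the wave64 sequence; s[0:1] holding the i1 mask and the .LBB_false label are placeholders, not taken from the patch:

  s_and_b64 s[0:1], s[0:1], exec  ; keep active lanes; SCC = (result != 0)
  s_cselect_b32 s0, 1, 0          ; the i1 -> i32 zext, on the scalar unit
  s_cmp_lg_u32 s0, 1              ; test the extended value
  s_cbranch_scc1 .LBB_false       ; uniform branch on SCC instead of vcc
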
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index e7c77d3123e825..16758df7059f8d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -52,7 +52,7 @@ bb:
 define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
 ; GFX11-LABEL: f2:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_load_b32 s21, s[2:3], 0x24
+; GFX11-NEXT:    s_load_b32 s22, s[2:3], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX11-NEXT:    s_mov_b32 s12, s13
 ; GFX11-NEXT:    s_mov_b64 s[10:11], s[4:5]
@@ -63,7 +63,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_mov_b32 s20, exec_lo
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mul_lo_u32 v0, s21, v0
+; GFX11-NEXT:    v_mul_lo_u32 v0, s22, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX11-NEXT:    s_cbranch_execz .LBB2_13
@@ -71,7 +71,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_load_b128 s[16:19], s[2:3], 0x2c
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_bitcmp1_b32 s17, 0
-; GFX11-NEXT:    s_cselect_b32 s22, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s21, -1, 0
 ; GFX11-NEXT:    s_bitcmp0_b32 s17, 0
 ; GFX11-NEXT:    s_mov_b32 s17, 0
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB2_3
@@ -93,8 +93,11 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:  .LBB2_3: ; %Flow10
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB2_12
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB2_12
 ; GFX11-NEXT:  ; %bb.4: ; %bb16
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x54
 ; GFX11-NEXT:    s_bitcmp1_b32 s19, 0
@@ -108,58 +111,57 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB2_8
 ; GFX11-NEXT:  ; %bb.5: ; %bb18.preheader
 ; GFX11-NEXT:    s_load_b128 s[24:27], s[2:3], 0x44
+; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_mul_hi_u32 s0, s25, s24
 ; GFX11-NEXT:    s_mul_i32 s1, s25, s24
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, 1
 ; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_mov_b32 s9, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, 1
-; GFX11-NEXT:    s_lshr_b32 s0, s0, s26
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshr_b32 s0, s0, s26
 ; GFX11-NEXT:    s_mul_i32 s0, s0, s18
-; GFX11-NEXT:    s_mul_i32 s0, s0, s16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s0, s21, s0
+; GFX11-NEXT:    s_mul_i32 s0, s0, s16
+; GFX11-NEXT:    s_or_b32 s0, s22, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_lshl_b64 s[18:19], s[0:1], 1
-; GFX11-NEXT:    s_mov_b32 s0, s1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[18:19]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s22
+; GFX11-NEXT:    global_load_u16 v0, v0, s[18:19]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB2_6: ; %bb18
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    s_and_b32 s13, s1, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
 ; GFX11-NEXT:    s_and_b32 s1, s7, s1
+; GFX11-NEXT:    s_and_b32 s16, s21, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s13, v2
-; GFX11-NEXT:    s_cselect_b32 s1, s13, s9
-; GFX11-NEXT:    s_and_b32 s9, 0xffff, s0
+; GFX11-NEXT:    s_cselect_b32 s1, s13, s16
+; GFX11-NEXT:    s_and_b32 s13, 0xffff, s9
 ; GFX11-NEXT:    s_and_b32 s1, s1, 1
-; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX11-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX11-NEXT:    s_and_b32 s16, s8, exec_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s9
-; GFX11-NEXT:    v_readfirstlane_b32 s9, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s13, v2
-; GFX11-NEXT:    s_cselect_b32 s9, s13, s9
+; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX11-NEXT:    s_and_b32 s16, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX11-NEXT:    s_and_b32 s18, s8, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s13, s13, s16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_bitcmp1_b32 s9, 0
-; GFX11-NEXT:    s_cselect_b32 s9, 0x100, 0
-; GFX11-NEXT:    s_or_b32 s0, s9, s0
+; GFX11-NEXT:    s_bitcmp1_b32 s13, 0
+; GFX11-NEXT:    s_cselect_b32 s13, 0x100, 0
+; GFX11-NEXT:    s_or_b32 s9, s13, s9
 ; GFX11-NEXT:    s_cbranch_vccz .LBB2_6
 ; GFX11-NEXT:  ; %bb.7: ; %Flow
 ; GFX11-NEXT:    s_mov_b32 s0, 0
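
The f2 hunks above are the wave32 form of the same rewrite: only the mask width changes (s_and_b32 with exec_lo), plus an s_delay_alu for the back-to-back SALU ops on GFX11. Quoting the new sequence from the Flow10 block:

  s_and_b32 s0, s0, exec_lo
  s_cselect_b32 s0, 1, 0
  s_delay_alu instid0(SALU_CYCLE_1)
  s_cmp_lg_u32 s0, 1
  s_cbranch_scc1 .LBB2_12
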
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 213813a94fc859..f35f44b02c7f9b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1969,17 +1969,22 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    s_cbranch_scc0 .LBB42_4
+; SI-NEXT:    s_cbranch_scc0 .LBB42_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_load_dword s5, s[2:3], 0x1
 ; SI-NEXT:    s_mov_b64 s[6:7], 0
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; SI-NEXT:    s_branch .LBB42_3
+; SI-NEXT:  .LBB42_2:
+; SI-NEXT:    s_mov_b64 s[6:7], -1
+; SI-NEXT:  .LBB42_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; SI-NEXT:    s_cselect_b32 s6, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s6, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB42_5
+; SI-NEXT:  ; %bb.4: ; %if
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 vcc, vcc
-; SI-NEXT:    s_cbranch_vccnz .LBB42_3
-; SI-NEXT:  .LBB42_2: ; %if
 ; SI-NEXT:    s_load_dword s5, s[2:3], 0x0
-; SI-NEXT:  .LBB42_3: ; %endif
+; SI-NEXT:  .LBB42_5: ; %endif
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
@@ -1987,8 +1992,6 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-; SI-NEXT:  .LBB42_4:
-; SI-NEXT:    s_branch .LBB42_2
 ;
 ; VI-LABEL: insert_split_bb:
 ; VI:       ; %bb.0: ; %entry
@@ -1996,14 +1999,22 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    s_cbranch_scc0 .LBB42_4
+; VI-NEXT:    s_cbranch_scc0 .LBB42_2
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    s_load_dword s5, s[2:3], 0x4
-; VI-NEXT:    s_cbranch_execnz .LBB42_3
-; VI-NEXT:  .LBB42_2: ; %if
+; VI-NEXT:    s_mov_b64 s[6:7], 0
+; VI-NEXT:    s_branch .LBB42_3
+; VI-NEXT:  .LBB42_2:
+; VI-NEXT:    s_mov_b64 s[6:7], -1
+; VI-NEXT:  .LBB42_3: ; %Flow
+; VI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; VI-NEXT:    s_cselect_b32 s6, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s6, 1
+; VI-NEXT:    s_cbranch_scc1 .LBB42_5
+; VI-NEXT:  ; %bb.4: ; %if
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s5, s[2:3], 0x0
-; VI-NEXT:  .LBB42_3: ; %endif
+; VI-NEXT:  .LBB42_5: ; %endif
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
@@ -2011,8 +2022,6 @@ define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-; VI-NEXT:  .LBB42_4:
-; VI-NEXT:    s_branch .LBB42_2
 entry:
   %0 = insertelement <2 x i32> undef, i32 %a, i32 0
   %1 = icmp eq i32 %a, 0
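
The insert_split_bb hunks above also show where the new Flow blocks and the paired s_mov_b64 -1/0 come from: each incoming edge of the join materializes the branch condition as a full wave mask, and the Flow block re-tests it with the scalar sequence to decide whether the other side of the diamond runs. A reduced IR shape with that structure (a sketch, not the test itself):

define amdgpu_kernel void @diamond(i32 %a, ptr addrspace(1) %out) {
entry:
  %cmp = icmp eq i32 %a, 0
  br i1 %cmp, label %if, label %else
else:
  br label %endif
; after structurization a Flow block sits here: it receives the original
; condition as a -1/0 wave mask in an SGPR pair and runs the
; s_and_b64 + s_cselect_b32 + s_cmp_lg_u32 sequence to decide whether
; %if executes
if:
  br label %endif
endif:
  %v = phi i32 [ 0, %if ], [ 1, %else ]
  store i32 %v, ptr addrspace(1) %out
  ret void
}
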
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 80cd97c0c262c0..fb9c5ba9bc3100 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -117,9 +117,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ; SI-NEXT:    s_load_dword s1, s[6:7], 0x32
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s0, s1
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; SI-NEXT:    s_cbranch_vccnz .LBB1_2
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s0, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB1_2
 ; SI-NEXT:  ; %bb.1: ; %bb0
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
@@ -135,9 +135,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ; CI-SDAG-NEXT:    s_load_dword s1, s[6:7], 0x32
 ; CI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-SDAG-NEXT:    s_cmp_eq_u32 s0, s1
-; CI-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CI-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; CI-SDAG-NEXT:    s_cbranch_vccnz .LBB1_2
+; CI-SDAG-NEXT:    s_cselect_b32 s0, 1, 0
+; CI-SDAG-NEXT:    s_cmp_lg_u32 s0, 1
+; CI-SDAG-NEXT:    s_cbranch_scc1 .LBB1_2
 ; CI-SDAG-NEXT:  ; %bb.1: ; %bb0
 ; CI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-SDAG-NEXT:    flat_store_dword v[0:1], v0
@@ -151,9 +151,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_cmp_eq_u32 s2, s1
-; GFX9-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GFX9-SDAG-NEXT:    s_cbranch_vccnz .LBB1_2
+; GFX9-SDAG-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v0, off
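
For reference, the minimal before/after from the is_private_sgpr hunk above, which is the same pattern with no surrounding control-flow changes:

  ; before: materialize a wave mask, branch through vcc
  s_cselect_b64 s[0:1], -1, 0
  s_andn2_b64 vcc, exec, s[0:1]
  s_cbranch_vccnz .LBB1_2
  ; after: keep the i1 as a scalar 0/1, branch on SCC
  s_cselect_b32 s0, 1, 0
  s_cmp_lg_u32 s0, 1
  s_cbranch_scc1 .LBB1_2
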
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index 8383621cef2f6b..b538a6920c6e97 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -184,9 +184,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ; SI-NEXT:    s_load_dword s1, s[6:7], 0x33
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s0, s1
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; SI-NEXT:    s_cbranch_vccnz .LBB1_2
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s0, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB1_2
 ; SI-NEXT:  ; %bb.1: ; %bb0
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
@@ -202,9 +202,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ; CI-SDAG-NEXT:    s_load_dword s1, s[6:7], 0x33
 ; CI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-SDAG-NEXT:    s_cmp_eq_u32 s0, s1
-; CI-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CI-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; CI-SDAG-NEXT:    s_cbranch_vccnz .LBB1_2
+; CI-SDAG-NEXT:    s_cselect_b32 s0, 1, 0
+; CI-SDAG-NEXT:    s_cmp_lg_u32 s0, 1
+; CI-SDAG-NEXT:    s_cbranch_scc1 .LBB1_2
 ; CI-SDAG-NEXT:  ; %bb.1: ; %bb0
 ; CI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-SDAG-NEXT:    flat_store_dword v[0:1], v0
@@ -218,9 +218,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ; GFX9-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_cmp_eq_u32 s2, s1
-; GFX9-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-SDAG-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GFX9-SDAG-NEXT:    s_cbranch_vccnz .LBB1_2
+; GFX9-SDAG-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-SDAG-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX9-SDAG-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX9-SDAG-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 6fb5a9ce47a843..15a38292ef51f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -102,22 +102,23 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 56
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b64 s[2:3], -1
-; GCN-NEXT:    s_cbranch_scc1 .LBB4_3
-; GCN-NEXT:  ; %bb.1: ; %Flow
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN-NEXT:    s_cbranch_vccz .LBB4_4
-; GCN-NEXT:  .LBB4_2: ; %.exit
-; GCN-NEXT:    s_endpgm
-; GCN-NEXT:  .LBB4_3: ; %.one
+; GCN-NEXT:    s_cbranch_scc0 .LBB4_2
+; GCN-NEXT:  ; %bb.1: ; %.one
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0
-; GCN-NEXT:    s_cbranch_execnz .LBB4_2
-; GCN-NEXT:  .LBB4_4: ; %.zero
+; GCN-NEXT:    s_mov_b64 s[2:3], 0
+; GCN-NEXT:  .LBB4_2: ; %Flow
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s2, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB4_4
+; GCN-NEXT:  ; %bb.3: ; %.zero
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT:  .LBB4_4: ; %.exit
 ; GCN-NEXT:    s_endpgm
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
   %cmp = icmp eq i32 %val, 56
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index d618b937910276..eda1dfb5450795 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -1162,53 +1162,23 @@ define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) {
 }
 
 define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
-; GFX6-LABEL: s_set_rounding_select_0_1:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX6-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s34, v0
-; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: s_set_rounding_select_0_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX7-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX7-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX7-NEXT:    v_readfirstlane_b32 s34, v0
-; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: s_set_rounding_select_0_1:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_mov_b32 s34, 0xa50f
-; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
-; GFX8-NEXT:    v_readfirstlane_b32 s34, v0
-; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX678-LABEL: s_set_rounding_select_0_1:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX678-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX678-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX678-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_set_rounding_select_0_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    s_mov_b32 s34, 0xa50f
-; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
-; GFX9-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX9-NEXT:    s_lshr_b32 s34, 0xa50f, s34
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1216,11 +1186,9 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s34, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX10-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX10-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX10-NEXT:    s_lshl_b32 s34, s34, 2
+; GFX10-NEXT:    s_lshr_b32 s34, 0xa50f, s34
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1228,11 +1196,9 @@ define amdgpu_gfx void @s_set_rounding_select_0_1(i32 inreg %cond) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-NEXT:    s_lshr_b32 s0, 0xa50f, s0
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %cond, 0
@@ -1324,53 +1290,23 @@ define void @v_set_rounding_select_1_3(i32 %cond) {
 }
 
 define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
-; GFX6-LABEL: s_set_rounding_select_2_0:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX6-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s34, v0
-; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: s_set_rounding_select_2_0:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX7-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT:    v_lshr_b32_e32 v0, 0xa50f, v0
-; GFX7-NEXT:    v_readfirstlane_b32 s34, v0
-; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: s_set_rounding_select_2_0:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT:    s_mov_b32 s34, 0xa50f
-; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
-; GFX8-NEXT:    v_readfirstlane_b32 s34, v0
-; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX678-LABEL: s_set_rounding_select_2_0:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX678-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX678-NEXT:    s_lshl_b32 s34, s34, 3
+; GFX678-NEXT:    s_lshr_b32 s34, 0xa50f, s34
+; GFX678-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_set_rounding_select_2_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_mov_b32 s34, 0xa50f
-; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s34
-; GFX9-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX9-NEXT:    s_lshl_b32 s34, s34, 3
+; GFX9-NEXT:    s_lshr_b32 s34, 0xa50f, s34
 ; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1378,11 +1314,9 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s34, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX10-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX10-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX10-NEXT:    s_lshl_b32 s34, s34, 3
+; GFX10-NEXT:    s_lshr_b32 s34, 0xa50f, s34
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1390,11 +1324,9 @@ define amdgpu_gfx void @s_set_rounding_select_2_0(i32 inreg %cond) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, 0xa50f
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 3
+; GFX11-NEXT:    s_lshr_b32 s0, 0xa50f, s0
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %cond, 0
@@ -1530,9 +1462,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX678-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX678-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX678-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX678-NEXT:    s_cselect_b32 s34, 1, 0
 ; GFX678-NEXT:    s_lshl_b32 s34, s34, 2
 ; GFX678-NEXT:    s_add_i32 s35, s34, -4
 ; GFX678-NEXT:    s_min_u32 s34, s34, s35
@@ -1547,9 +1477,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b64 s[34:35], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[34:35]
-; GFX9-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
 ; GFX9-NEXT:    s_lshl_b32 s34, s34, 2
 ; GFX9-NEXT:    s_add_i32 s35, s34, -4
 ; GFX9-NEXT:    s_min_u32 s34, s34, s35
@@ -1564,9 +1492,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s34, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s34
-; GFX10-NEXT:    v_readfirstlane_b32 s34, v0
+; GFX10-NEXT:    s_cselect_b32 s34, 1, 0
 ; GFX10-NEXT:    s_lshl_b32 s34, s34, 2
 ; GFX10-NEXT:    s_add_i32 s35, s34, -4
 ; GFX10-NEXT:    s_min_u32 s34, s34, s35
@@ -1581,9 +1507,7 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    s_add_i32 s1, s0, -4
 ; GFX11-NEXT:    s_min_u32 s0, s0, s1
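
The llvm.set.rounding hunks above show the payoff when the extended i1 feeds further scalar arithmetic: the whole chain now stays on the SALU, dropping the v_cndmask_b32/v_readfirstlane_b32 round trip through a VGPR. A reduced sketch of the IR shape involved (hypothetical, modelled on s_set_rounding_select_0_1; the function name and return are illustrative):

define i32 @uniform_zext_shift(i32 inreg %cond) {
  %cmp = icmp ne i32 %cond, 0     ; uniform compare -> s_cmp_lg_u32
  %ext = zext i1 %cmp to i32      ; now s_cselect_b32 s, 1, 0
  %amt = shl i32 %ext, 2          ; s_lshl_b32
  %tbl = lshr i32 42255, %amt     ; s_lshr_b32 with 0xa50f, as in the checks
  ret i32 %tbl
}
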
diff --git a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
index 7814eb603e5541..f81a9840e5a16a 100644
--- a/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
+++ b/llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll
@@ -258,23 +258,33 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
 ; GCN-NEXT:    s_mov_b64 s[0:1], -1
-; GCN-NEXT:    s_cbranch_scc0 .LBB5_1
-; GCN-NEXT:  ; %bb.7: ; %bb0
+; GCN-NEXT:    s_cbranch_scc1 .LBB5_1
+; GCN-NEXT:  ; %bb.5: ; %bb0
 ; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:  .Lpost_getpc5:
-; GCN-NEXT:    s_add_u32 s8, s8, (.LBB5_4-.Lpost_getpc5)&4294967295
-; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB5_4-.Lpost_getpc5)>>32
+; GCN-NEXT:  .Lpost_getpc4:
+; GCN-NEXT:    s_add_u32 s8, s8, (.LBB5_2-.Lpost_getpc4)&4294967295
+; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB5_2-.Lpost_getpc4)>>32
 ; GCN-NEXT:    s_setpc_b64 s[8:9]
-; GCN-NEXT:  .LBB5_1: ; %Flow
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GCN-NEXT:    s_cbranch_vccnz .LBB5_3
-; GCN-NEXT:  .LBB5_2: ; %bb2
+; GCN-NEXT:  .LBB5_1: ; %bb3
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    v_nop_e64
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_mov_b64 s[0:1], 0
+; GCN-NEXT:  .LBB5_2: ; %Flow
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB5_4
+; GCN-NEXT:  ; %bb.3: ; %bb2
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 17
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:  .LBB5_3: ; %bb4
+; GCN-NEXT:  .LBB5_4: ; %bb4
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
@@ -284,27 +294,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_endpgm
-; GCN-NEXT:  .LBB5_4: ; %bb3
-; GCN-NEXT:    ;;#ASMSTART
-; GCN-NEXT:    v_nop_e64
-; GCN-NEXT:    v_nop_e64
-; GCN-NEXT:    v_nop_e64
-; GCN-NEXT:    v_nop_e64
-; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    s_mov_b64 vcc, exec
-; GCN-NEXT:    s_cbranch_execnz .LBB5_5
-; GCN-NEXT:  ; %bb.9: ; %bb3
-; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:  .Lpost_getpc6:
-; GCN-NEXT:    s_add_u32 s8, s8, (.LBB5_2-.Lpost_getpc6)&4294967295
-; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB5_2-.Lpost_getpc6)>>32
-; GCN-NEXT:    s_setpc_b64 s[8:9]
-; GCN-NEXT:  .LBB5_5: ; %bb3
-; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:  .Lpost_getpc4:
-; GCN-NEXT:    s_add_u32 s8, s8, (.LBB5_3-.Lpost_getpc4)&4294967295
-; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB5_3-.Lpost_getpc4)>>32
-; GCN-NEXT:    s_setpc_b64 s[8:9]
 bb0:
   %tmp = icmp ne i32 %arg1, 0
   br i1 %tmp, label %bb2, label %bb3
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 8157b1a7f7c802..c0840cfbcbe859 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -27,11 +27,20 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:  ; %bb.2: ; %loop-memcpy-residual-header
 ; CHECK-NEXT:    s_branch .LBB0_4
 ; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_mov_b64 s[4:5], -1
 ; CHECK-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; CHECK-NEXT:    s_branch .LBB0_5
+; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 1
+; CHECK-NEXT:    s_cbranch_scc0 .LBB0_5
+; CHECK-NEXT:    s_branch .LBB0_7
 ; CHECK-NEXT:  .LBB0_4: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge
 ; CHECK-NEXT:    v_lshlrev_b64 v[6:7], 6, v[2:3]
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_7
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_7
 ; CHECK-NEXT:  .LBB0_5: ; %loop-memcpy-residual.preheader
 ; CHECK-NEXT:    v_or_b32_e32 v2, 32, v4
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v5
@@ -66,8 +75,10 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
 ; CHECK-NEXT:  .LBB0_9: ; %Flow16
 ; CHECK-NEXT:    ; in Loop: Header=BB0_10 Depth=1
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_18
+; CHECK-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; CHECK-NEXT:    s_cselect_b32 s8, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s8, 1
+; CHECK-NEXT:    s_cbranch_scc0 .LBB0_18
 ; CHECK-NEXT:  .LBB0_10: ; %while.cond
 ; CHECK-NEXT:    ; =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ; Child Loop BB0_12 Depth 2
@@ -155,11 +166,20 @@ define void @issue63986_reduced_expanded(i64 %idxprom) {
 ; CHECK-NEXT:    s_mov_b32 s5, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB1_4
 ; CHECK-NEXT:  ; %bb.3:
+; CHECK-NEXT:    s_mov_b64 s[6:7], -1
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT:    s_branch .LBB1_5
+; CHECK-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 1
+; CHECK-NEXT:    s_cbranch_scc0 .LBB1_5
+; CHECK-NEXT:    s_branch .LBB1_8
 ; CHECK-NEXT:  .LBB1_4: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge
 ; CHECK-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; CHECK-NEXT:    s_cbranch_execnz .LBB1_8
+; CHECK-NEXT:    s_mov_b64 s[6:7], 0
+; CHECK-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; CHECK-NEXT:    s_cselect_b32 s6, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s6, 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB1_8
 ; CHECK-NEXT:  .LBB1_5: ; %loop-memcpy-residual.preheader
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    s_mov_b64 s[6:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index ff39439a2db1af..adb7348fb56d10 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -2455,13 +2455,12 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_lt_u32 s4, s5
-; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; CI-NEXT:    s_cselect_b32 s0, s4, s5
-; CI-NEXT:    v_mov_b32_e32 v5, s0
-; CI-NEXT:    flat_store_dword v[0:1], v5
-; CI-NEXT:    flat_store_byte v[2:3], v4
+; CI-NEXT:    s_cselect_b32 s1, 1, 0
+; CI-NEXT:    v_mov_b32_e32 v4, s0
+; CI-NEXT:    v_mov_b32_e32 v5, s1
+; CI-NEXT:    flat_store_dword v[0:1], v4
+; CI-NEXT:    flat_store_byte v[2:3], v5
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_test_umin_ult_i32_multi_use:
@@ -2476,13 +2475,12 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_lt_u32 s4, s5
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; VI-NEXT:    s_cselect_b32 s0, s4, s5
-; VI-NEXT:    v_mov_b32_e32 v5, s0
-; VI-NEXT:    flat_store_dword v[0:1], v5
-; VI-NEXT:    flat_store_byte v[2:3], v4
+; VI-NEXT:    s_cselect_b32 s1, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    flat_store_byte v[2:3], v5
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
@@ -2490,55 +2488,51 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s2, s[12:13], 0x0
-; GFX9-NEXT:    s_load_dword s3, s[14:15], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[12:13], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[14:15], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_lt_u32 s2, s3
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, s2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    global_store_dword v0, v2, s[8:9]
-; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
+; GFX9-NEXT:    s_cmp_lt_u32 s0, s1
+; GFX9-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX9-NEXT:    global_store_byte v0, v2, s[10:11]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_load_dword s0, s[12:13], 0x0
 ; GFX10-NEXT:    s_load_dword s1, s[14:15], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_lt_u32 s0, s1
-; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-NEXT:    s_and_b32 s2, s2, exec_lo
 ; GFX10-NEXT:    s_cselect_b32 s0, s0, s1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    global_store_dword v1, v2, s[8:9]
-; GFX10-NEXT:    global_store_byte v1, v0, s[10:11]
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX10-NEXT:    global_store_byte v0, v2, s[10:11]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[2:3], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
 ; GFX11-NEXT:    s_load_b32 s5, s[6:7], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, s5
-; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
-; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s4, s4, s5
-; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s4
+; GFX11-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b32 v1, v2, s[0:1]
-; GFX11-NEXT:    global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX11-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %aptr, align 4
   %b = load i32, ptr addrspace(1) %bptr, align 4
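
The v_test_umin_ult_i32_multi_use hunks above show the multi-use case: one compare now feeds both the min select and the stored bool. Since s_cselect_b32 reads SCC without clobbering it, the two selects chain directly off a single s_cmp, e.g. in the GFX9 checks:

  s_cmp_lt_u32 s0, s1        ; single scalar compare
  s_cselect_b32 s0, s0, s1   ; the umin itself
  s_cselect_b32 s1, 1, 0     ; zext of the same i1, still reading SCC
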
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 03de142a41b4c7..b3bf7565f2c23c 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2092,13 +2092,14 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:  .LBB15_3: ; %Flow
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 vcc, vcc
-; SI-NEXT:    s_cbranch_vccnz .LBB15_5
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB15_5
 ; SI-NEXT:  ; %bb.4: ; %if
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
@@ -2106,6 +2107,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:  .LBB15_5:
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:  .LBB15_6: ; %endif
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -2127,8 +2129,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    ; implicit-def: $sgpr6
 ; VI-NEXT:  .LBB15_3: ; %Flow
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; VI-NEXT:    s_cbranch_vccnz .LBB15_5
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s4, 1
+; VI-NEXT:    s_cbranch_scc1 .LBB15_5
 ; VI-NEXT:  ; %bb.4: ; %if
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -2162,8 +2166,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-NEXT:  .LBB15_3: ; %Flow
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_vccnz .LBB15_5
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB15_5
 ; GFX9-NEXT:  ; %bb.4: ; %if
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
@@ -2197,8 +2203,10 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    ; implicit-def: $sgpr0
 ; GFX10-NEXT:  .LBB15_3: ; %Flow
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX10-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
-; GFX10-NEXT:    s_cbranch_vccnz .LBB15_5
+; GFX10-NEXT:    s_and_b32 s1, s8, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 1
+; GFX10-NEXT:    s_cbranch_scc1 .LBB15_5
 ; GFX10-NEXT:  ; %bb.4: ; %if
 ; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX10-NEXT:    s_mov_b32 s2, -1
@@ -2220,20 +2228,23 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-LABEL: mul32_in_branch:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_mov_b32 s5, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX11-NEXT:  ; %bb.1: ; %else
-; GFX11-NEXT:    s_mul_i32 s5, s0, s1
+; GFX11-NEXT:    s_mul_i32 s4, s0, s1
 ; GFX11-NEXT:    s_branch .LBB15_3
 ; GFX11-NEXT:  .LBB15_2:
-; GFX11-NEXT:    s_mov_b32 s4, -1
-; GFX11-NEXT:    ; implicit-def: $sgpr5
+; GFX11-NEXT:    s_mov_b32 s5, -1
+; GFX11-NEXT:    ; implicit-def: $sgpr4
 ; GFX11-NEXT:  .LBB15_3: ; %Flow
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX11-NEXT:    s_cbranch_vccnz .LBB15_5
+; GFX11-NEXT:    s_and_b32 s5, s5, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s5, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB15_5
 ; GFX11-NEXT:  ; %bb.4: ; %if
 ; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s6, -1
@@ -2243,7 +2254,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_branch .LBB15_6
 ; GFX11-NEXT:  .LBB15_5:
-; GFX11-NEXT:    v_mov_b32_e32 v0, s5
+; GFX11-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11-NEXT:  .LBB15_6: ; %endif
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
@@ -2255,20 +2266,23 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-LABEL: mul32_in_branch:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x34
-; GFX12-NEXT:    s_mov_b32 s4, 0
+; GFX12-NEXT:    s_mov_b32 s5, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX12-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX12-NEXT:  ; %bb.1: ; %else
-; GFX12-NEXT:    s_mul_i32 s5, s0, s1
+; GFX12-NEXT:    s_mul_i32 s4, s0, s1
 ; GFX12-NEXT:    s_branch .LBB15_3
 ; GFX12-NEXT:  .LBB15_2:
-; GFX12-NEXT:    s_mov_b32 s4, -1
-; GFX12-NEXT:    ; implicit-def: $sgpr5
+; GFX12-NEXT:    s_mov_b32 s5, -1
+; GFX12-NEXT:    ; implicit-def: $sgpr4
 ; GFX12-NEXT:  .LBB15_3: ; %Flow
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
-; GFX12-NEXT:    s_cbranch_vccnz .LBB15_5
+; GFX12-NEXT:    s_and_b32 s5, s5, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s5, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB15_5
 ; GFX12-NEXT:  ; %bb.4: ; %if
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s6, -1
@@ -2278,7 +2292,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
 ; GFX12-NEXT:    s_branch .LBB15_6
 ; GFX12-NEXT:  .LBB15_5:
-; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:  .LBB15_6: ; %endif
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
@@ -2349,7 +2363,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
 ; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
-; SI-NEXT:    s_cbranch_vccz .LBB16_4
+; SI-NEXT:    s_cbranch_vccz .LBB16_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    v_mul_hi_u32 v0, s4, v0
@@ -2359,23 +2373,27 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT:    s_cbranch_vccnz .LBB16_3
-; SI-NEXT:  .LBB16_2: ; %if
+; SI-NEXT:    s_branch .LBB16_3
+; SI-NEXT:  .LBB16_2:
+; SI-NEXT:    s_mov_b64 s[8:9], -1
+; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SI-NEXT:  .LBB16_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[4:5], s[8:9], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB16_5
+; SI-NEXT:  ; %bb.4: ; %if
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; SI-NEXT:  .LBB16_3: ; %endif
+; SI-NEXT:  .LBB16_5: ; %endif
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-; SI-NEXT:  .LBB16_4:
-; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; SI-NEXT:    s_branch .LBB16_2
 ;
 ; VI-LABEL: mul64_in_branch:
 ; VI:       ; %bb.0: ; %entry
@@ -2383,7 +2401,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_mov_b64 s[8:9], 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; VI-NEXT:    s_cbranch_scc0 .LBB16_4
+; VI-NEXT:    s_cbranch_scc0 .LBB16_2
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
@@ -2391,23 +2409,27 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
 ; VI-NEXT:    s_mul_i32 s4, s5, s6
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
-; VI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; VI-NEXT:    s_cbranch_vccnz .LBB16_3
-; VI-NEXT:  .LBB16_2: ; %if
+; VI-NEXT:    s_branch .LBB16_3
+; VI-NEXT:  .LBB16_2:
+; VI-NEXT:    s_mov_b64 s[8:9], -1
+; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT:  .LBB16_3: ; %Flow
+; VI-NEXT:    s_and_b64 s[4:5], s[8:9], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s4, 1
+; VI-NEXT:    s_cbranch_scc1 .LBB16_5
+; VI-NEXT:  ; %bb.4: ; %if
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_mov_b32 s4, s2
 ; VI-NEXT:    s_mov_b32 s5, s3
 ; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; VI-NEXT:  .LBB16_3: ; %endif
+; VI-NEXT:  .LBB16_5: ; %endif
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
-; VI-NEXT:  .LBB16_4:
-; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; VI-NEXT:    s_branch .LBB16_2
 ;
 ; GFX9-LABEL: mul64_in_branch:
 ; GFX9:       ; %bb.0: ; %entry
@@ -2415,7 +2437,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT:    s_cbranch_scc0 .LBB16_3
+; GFX9-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GFX9-NEXT:  ; %bb.1: ; %else
 ; GFX9-NEXT:    s_mul_i32 s2, s8, s11
 ; GFX9-NEXT:    s_mul_hi_u32 s3, s8, s10
@@ -2423,22 +2445,26 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    s_mul_i32 s3, s9, s10
 ; GFX9-NEXT:    s_add_i32 s3, s2, s3
 ; GFX9-NEXT:    s_mul_i32 s2, s8, s10
-; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
-; GFX9-NEXT:    s_cbranch_vccnz .LBB16_4
-; GFX9-NEXT:  .LBB16_2: ; %if
+; GFX9-NEXT:    s_branch .LBB16_3
+; GFX9-NEXT:  .LBB16_2:
+; GFX9-NEXT:    s_mov_b64 s[0:1], -1
+; GFX9-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; GFX9-NEXT:  .LBB16_3: ; %Flow
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX9-NEXT:    s_cbranch_scc1 .LBB16_5
+; GFX9-NEXT:  ; %bb.4: ; %if
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_mov_b32 s0, s6
 ; GFX9-NEXT:    s_mov_b32 s1, s7
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT:    s_branch .LBB16_5
-; GFX9-NEXT:  .LBB16_3:
-; GFX9-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GFX9-NEXT:    s_branch .LBB16_2
-; GFX9-NEXT:  .LBB16_4:
+; GFX9-NEXT:    s_branch .LBB16_6
+; GFX9-NEXT:  .LBB16_5:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:  .LBB16_5: ; %endif
+; GFX9-NEXT:  .LBB16_6: ; %endif
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2450,7 +2476,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_lg_u64 s[8:9], 0
-; GFX10-NEXT:    s_cbranch_scc0 .LBB16_3
+; GFX10-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GFX10-NEXT:  ; %bb.1: ; %else
 ; GFX10-NEXT:    s_mul_i32 s0, s8, s11
 ; GFX10-NEXT:    s_mul_hi_u32 s1, s8, s10
@@ -2458,21 +2484,27 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    s_add_i32 s0, s1, s0
 ; GFX10-NEXT:    s_add_i32 s1, s0, s2
 ; GFX10-NEXT:    s_mul_i32 s0, s8, s10
-; GFX10-NEXT:    s_cbranch_execnz .LBB16_4
-; GFX10-NEXT:  .LBB16_2: ; %if
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    s_branch .LBB16_3
+; GFX10-NEXT:  .LBB16_2:
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX10-NEXT:  .LBB16_3: ; %Flow
+; GFX10-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX10-NEXT:    s_cbranch_scc1 .LBB16_5
+; GFX10-NEXT:  ; %bb.4: ; %if
 ; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_mov_b32 s0, s6
 ; GFX10-NEXT:    s_mov_b32 s1, s7
 ; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; GFX10-NEXT:    s_branch .LBB16_5
-; GFX10-NEXT:  .LBB16_3:
-; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
-; GFX10-NEXT:    s_branch .LBB16_2
-; GFX10-NEXT:  .LBB16_4:
+; GFX10-NEXT:    s_branch .LBB16_6
+; GFX10-NEXT:  .LBB16_5:
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:  .LBB16_5: ; %endif
+; GFX10-NEXT:  .LBB16_6: ; %endif
 ; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -2484,7 +2516,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB16_3
+; GFX11-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GFX11-NEXT:  ; %bb.1: ; %else
 ; GFX11-NEXT:    s_mul_i32 s7, s4, s7
 ; GFX11-NEXT:    s_mul_hi_u32 s8, s4, s6
@@ -2492,20 +2524,27 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_add_i32 s7, s8, s7
 ; GFX11-NEXT:    s_mul_i32 s4, s4, s6
 ; GFX11-NEXT:    s_add_i32 s5, s7, s5
-; GFX11-NEXT:    s_cbranch_execnz .LBB16_4
-; GFX11-NEXT:  .LBB16_2: ; %if
+; GFX11-NEXT:    s_mov_b32 s6, 0
+; GFX11-NEXT:    s_branch .LBB16_3
+; GFX11-NEXT:  .LBB16_2:
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX11-NEXT:  .LBB16_3: ; %Flow
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX11-NEXT:    s_cbranch_scc1 .LBB16_5
+; GFX11-NEXT:  ; %bb.4: ; %if
 ; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_mov_b32 s4, s2
 ; GFX11-NEXT:    s_mov_b32 s5, s3
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], 0
-; GFX11-NEXT:    s_branch .LBB16_5
-; GFX11-NEXT:  .LBB16_3:
-; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GFX11-NEXT:    s_branch .LBB16_2
-; GFX11-NEXT:  .LBB16_4:
+; GFX11-NEXT:    s_branch .LBB16_6
+; GFX11-NEXT:  .LBB16_5:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT:  .LBB16_5: ; %endif
+; GFX11-NEXT:  .LBB16_6: ; %endif
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -2517,23 +2556,30 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX12-NEXT:    s_cbranch_scc0 .LBB16_3
+; GFX12-NEXT:    s_cbranch_scc0 .LBB16_2
 ; GFX12-NEXT:  ; %bb.1: ; %else
 ; GFX12-NEXT:    s_mul_u64 s[4:5], s[4:5], s[6:7]
-; GFX12-NEXT:    s_cbranch_execnz .LBB16_4
-; GFX12-NEXT:  .LBB16_2: ; %if
+; GFX12-NEXT:    s_mov_b32 s6, 0
+; GFX12-NEXT:    s_branch .LBB16_3
+; GFX12-NEXT:  .LBB16_2:
+; GFX12-NEXT:    s_mov_b32 s6, -1
+; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT:  .LBB16_3: ; %Flow
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s6, 1
+; GFX12-NEXT:    s_cbranch_scc1 .LBB16_5
+; GFX12-NEXT:  ; %bb.4: ; %if
 ; GFX12-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s6, -1
 ; GFX12-NEXT:    s_mov_b32 s4, s2
 ; GFX12-NEXT:    s_mov_b32 s5, s3
 ; GFX12-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], null
-; GFX12-NEXT:    s_branch .LBB16_5
-; GFX12-NEXT:  .LBB16_3:
-; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GFX12-NEXT:    s_branch .LBB16_2
-; GFX12-NEXT:  .LBB16_4:
+; GFX12-NEXT:    s_branch .LBB16_6
+; GFX12-NEXT:  .LBB16_5:
 ; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT:  .LBB16_5: ; %endif
+; GFX12-NEXT:  .LBB16_6: ; %endif
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 4ee2b8e981f449..35175ef8dfdfc1 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -4,14 +4,14 @@
 define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-LABEL: negated_cond:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s10, -1
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    s_mov_b32 s7, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s4
-; GCN-NEXT:    s_mov_b32 s9, s5
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_branch .LBB0_2
 ; GCN-NEXT:  .LBB0_1: ; %loop.exit.guard
@@ -21,26 +21,31 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:  .LBB0_2: ; %bb1
 ; GCN-NEXT:    ; =>This Loop Header: Depth=1
 ; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
-; GCN-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v1
-; GCN-NEXT:    s_mov_b32 s12, s6
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    s_cmp_eq_u32 s10, 0
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s10, 0
+; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT:    s_mov_b32 s12, s2
 ; GCN-NEXT:    s_branch .LBB0_4
 ; GCN-NEXT:  .LBB0_3: ; %Flow1
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-NEXT:    s_cbranch_vccz .LBB0_1
+; GCN-NEXT:    s_and_b64 s[16:17], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s13, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s13, 1
+; GCN-NEXT:    s_cbranch_scc0 .LBB0_1
 ; GCN-NEXT:  .LBB0_4: ; %bb2
 ; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
-; GCN-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GCN-NEXT:    s_lshl_b32 s12, s12, 5
-; GCN-NEXT:    s_cbranch_vccz .LBB0_6
+; GCN-NEXT:    s_and_b64 s[14:15], s[8:9], exec
+; GCN-NEXT:    s_cselect_b32 s13, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s13, 1
+; GCN-NEXT:    s_cbranch_scc0 .LBB0_6
 ; GCN-NEXT:  ; %bb.5: ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GCN-NEXT:    s_mov_b64 s[14:15], s[10:11]
 ; GCN-NEXT:    s_branch .LBB0_7
 ; GCN-NEXT:  .LBB0_6: ; %bb3
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
@@ -48,9 +53,11 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:    s_mov_b64 s[14:15], -1
 ; GCN-NEXT:  .LBB0_7: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
+; GCN-NEXT:    s_and_b64 s[16:17], s[14:15], exec
+; GCN-NEXT:    s_cselect_b32 s13, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s13, 1
 ; GCN-NEXT:    s_mov_b64 s[16:17], -1
-; GCN-NEXT:    s_cbranch_vccnz .LBB0_3
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
 ; GCN-NEXT:  ; %bb.8: ; %bb4
 ; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
 ; GCN-NEXT:    s_ashr_i32 s13, s12, 31
@@ -58,7 +65,7 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s16
 ; GCN-NEXT:    v_mov_b32_e32 v2, s17
-; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 32
 ; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-NEXT:    s_branch .LBB0_3
@@ -92,47 +99,52 @@ bb4:
 define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) {
 ; GCN-LABEL: negated_cond_dominated_blocks:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GCN-NEXT:    s_mov_b32 s6, 0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GCN-NEXT:    s_mov_b32 s2, 0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_mov_b32 s3, s6
+; GCN-NEXT:    s_mov_b32 s5, s2
 ; GCN-NEXT:    s_branch .LBB1_2
 ; GCN-NEXT:  .LBB1_1: ; %bb7
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_lshl_b64 s[8:9], s[2:3], 2
-; GCN-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NEXT:    s_cmp_eq_u32 s2, 32
-; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT:    s_mov_b32 s3, s2
-; GCN-NEXT:    s_cbranch_scc1 .LBB1_6
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    s_lshl_b64 s[6:7], s[4:5], 2
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NEXT:    s_cmp_eq_u32 s4, 32
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    s_cbranch_scc1 .LBB1_7
 ; GCN-NEXT:  .LBB1_2: ; %bb4
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    s_mov_b64 vcc, s[0:1]
+; GCN-NEXT:    s_mov_b64 vcc, vcc
 ; GCN-NEXT:    s_cbranch_vccz .LBB1_4
 ; GCN-NEXT:  ; %bb.3: ; %bb6
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_add_i32 s2, s3, 1
-; GCN-NEXT:    s_mov_b64 vcc, exec
-; GCN-NEXT:    s_cbranch_execnz .LBB1_1
+; GCN-NEXT:    s_add_i32 s4, s5, 1
+; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    s_branch .LBB1_5
 ; GCN-NEXT:  .LBB1_4: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    ; implicit-def: $sgpr2
-; GCN-NEXT:    s_mov_b64 vcc, 0
-; GCN-NEXT:  .LBB1_5: ; %bb5
+; GCN-NEXT:    s_mov_b64 s[6:7], -1
+; GCN-NEXT:    ; implicit-def: $sgpr4
+; GCN-NEXT:  .LBB1_5: ; %Flow
 ; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT:    s_lshl_b32 s2, s3, 5
-; GCN-NEXT:    s_or_b32 s2, s2, 1
+; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_cselect_b32 s6, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s6, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB1_1
+; GCN-NEXT:  ; %bb.6: ; %bb5
+; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT:    s_lshl_b32 s4, s5, 5
+; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    s_branch .LBB1_1
-; GCN-NEXT:  .LBB1_6: ; %bb3
+; GCN-NEXT:  .LBB1_7: ; %bb3
 ; GCN-NEXT:    s_endpgm
 bb:
   br label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index eff80236d98663..c4e155314370c3 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -1180,7 +1180,9 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_max_f32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_cmp_le_f32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX6-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -1207,7 +1209,9 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX8-NEXT:    v_max_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1254,7 +1258,9 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
 ; GFX6-NEXT:    s_cmp_eq_u32 s6, s7
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -1270,7 +1276,9 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, s7
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 0d88466fc31b3e..d7b2a5401fece8 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -519,8 +519,8 @@ define amdgpu_kernel void @alloca_promote_atomicrmw_private_lds_promote(ptr addr
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s4, 1
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -561,8 +561,8 @@ define amdgpu_kernel void @alloca_promote_cmpxchg_private(ptr addrspace(1) %out,
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s4, 1
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 8f4a4b5afcdc1e..ab76abbb44a6ce 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -19,8 +19,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -106,8 +108,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
@@ -199,8 +203,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-SDAG-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; SI-SDAG-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-SDAG-NEXT:    s_lshl_b32 s4, s4, 8
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
 ; SI-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], exec
 ; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -286,8 +292,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-SDAG-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
-; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; VI-SDAG-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-SDAG-NEXT:    s_lshl_b32 s4, s4, 8
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
 ; VI-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], exec
 ; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
@@ -380,8 +388,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -467,8 +477,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
@@ -560,8 +572,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 9
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
 ; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -647,8 +661,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 9
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
 ; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
@@ -4521,8 +4537,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
-; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
@@ -4592,8 +4610,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-SDAG-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
 ; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index d8deb810971106..586290295c5860 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -141,12 +141,14 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s0, s6
 ; SI-NEXT:    s_mov_b32 s1, s7
 ; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s14
 ; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    v_mov_b32_e32 v0, s14
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
 ; SI-NEXT:    s_mov_b32 s3, s11
 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -161,13 +163,15 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; VI-NEXT:    s_cmp_lt_i32 s4, s0
 ; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_store_dword v[0:1], v4
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -253,7 +257,9 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v2, v0
 ; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v2, off, s[8:11], 0
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -277,8 +283,10 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v5
 ; VI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v6, v4
 ; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    flat_store_dword v[0:1], v6
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -357,17 +365,19 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
 ; SI-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
 ; SI-NEXT:    v_mov_b32_e32 v1, s13
-; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
 ; SI-NEXT:    s_mov_b32 s0, s2
 ; SI-NEXT:    s_mov_b32 s1, s3
 ; SI-NEXT:    s_mov_b32 s2, s10
 ; SI-NEXT:    s_mov_b32 s3, s11
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -377,19 +387,21 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_add_u32 s0, s4, s6
-; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s5
 ; VI-NEXT:    s_addc_u32 s1, s5, s7
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[3:4]
+; VI-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT:    s_xor_b64 s[2:3], s[4:5], vcc
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -407,8 +419,10 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    global_store_byte v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -419,12 +433,14 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_u32 s0, s8, s10
 ; GFX10-NEXT:    s_addc_u32 s1, s9, s11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[0:1], s[8:9]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[10:11], 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[0:1], s[8:9]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-NEXT:    s_xor_b32 s0, s2, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT:    s_xor_b32 s2, s3, s2
+; GFX10-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    global_store_byte v2, v3, s[6:7]
 ; GFX10-NEXT:    s_endpgm
@@ -435,13 +451,16 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_u32 s8, s4, s6
 ; GFX11-NEXT:    s_addc_u32 s9, s5, s7
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s6, s[6:7], 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
-; GFX11-NEXT:    v_mov_b32_e32 v0, s8
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s4, s6, s4
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s5, s[6:7], 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_xor_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    global_store_b8 v2, v3, s[2:3]
@@ -482,7 +501,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
 ; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
 ; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -507,7 +528,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[8:9]
 ; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[6:7], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -525,7 +548,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    global_store_byte v6, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -540,10 +565,12 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX10-NEXT:    s_xor_b32 s0, s0, vcc_lo
+; GFX10-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    global_store_dwordx2 v6, v[4:5], s[4:5]
 ; GFX10-NEXT:    global_store_byte v6, v0, s[6:7]
 ; GFX10-NEXT:    s_endpgm
@@ -559,12 +586,14 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX11-NEXT:    s_xor_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[4:5]
 ; GFX11-NEXT:    global_store_b8 v6, v0, s[6:7]
@@ -603,14 +632,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v4, v0
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v5, v1
+; SI-NEXT:    s_xor_b64 s[2:3], vcc, s[2:3]
 ; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
 ; SI-NEXT:    s_endpgm
@@ -632,14 +665,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, v1, v3
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, v0, v2
-; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v9, v1
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
+; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], 0, v3
 ; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], v8, v0
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], v9, v1
+; VI-NEXT:    s_xor_b64 s[2:3], vcc, s[2:3]
 ; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
-; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[2:3]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[8:9]
 ; VI-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index f4776747f16ac1..3fab6e805c77d2 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -142,7 +142,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
 ; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0xd
-; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s7, 31
 ; GCN-IR-NEXT:    s_mov_b32 s1, s0
@@ -168,17 +167,22 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s13
 ; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s12
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GCN-IR-NEXT:    s_and_b64 s[18:19], s[18:19], exec
+; GCN-IR-NEXT:    s_cselect_b32 s15, 1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
+; GCN-IR-NEXT:    s_cmp_lg_u32 s15, 1
+; GCN-IR-NEXT:    s_mov_b32 s15, 0
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s18, s16, 1
 ; GCN-IR-NEXT:    s_addc_u32 s19, s17, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s16
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-IR-NEXT:    s_cselect_b32 s17, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s10
+; GCN-IR-NEXT:    s_cmp_lg_u32 s17, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s18
 ; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
@@ -1067,7 +1071,6 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-LABEL: s_test_sdiv24_48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[4:5], 24
@@ -1101,17 +1104,22 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s13
 ; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s12
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GCN-IR-NEXT:    s_and_b64 s[18:19], s[18:19], exec
+; GCN-IR-NEXT:    s_cselect_b32 s15, 1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
+; GCN-IR-NEXT:    s_cmp_lg_u32 s15, 1
+; GCN-IR-NEXT:    s_mov_b32 s15, 0
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB9_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s18, s16, 1
 ; GCN-IR-NEXT:    s_addc_u32 s19, s17, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
-; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s16
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-IR-NEXT:    s_cselect_b32 s17, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s16
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s10
+; GCN-IR-NEXT:    s_cmp_lg_u32 s17, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB9_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s18
 ; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
@@ -1303,17 +1311,21 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_and_b64 s[8:9], s[12:13], exec
 ; GCN-IR-NEXT:    s_cselect_b32 s8, 0, 24
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
+; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 1, 0
+; GCN-IR-NEXT:    s_cmp_lg_u32 s9, 1
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
 ; GCN-IR-NEXT:    s_addc_u32 s13, s11, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[12:13], 0
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s10
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_4
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GCN-IR-NEXT:    s_cselect_b32 s11, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s10
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s8
+; GCN-IR-NEXT:    s_cmp_lg_u32 s11, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB10_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s12
 ; GCN-IR-NEXT:    s_add_u32 s16, s2, -1
diff --git a/llvm/test/CodeGen/AMDGPU/select-i32-zext.ll b/llvm/test/CodeGen/AMDGPU/select-i32-zext.ll
new file mode 100644
index 00000000000000..bb0d6ca3149f56
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/select-i32-zext.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX900 %s
+
+; Test translation of a select statement that requires a subsequent i1->i32 zext conversion.
+
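+; With a uniform condition and a power-of-two arm (32 = 1 << 5), the
+; zext'd select is expected to stay on the scalar unit: s_cselect_b32
+; materializes the 0/1 value and a single s_lshl_b32 forms the result,
+; with no v_cndmask_b32 round-trip through a VGPR.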
+define amdgpu_kernel void @select_i32_zext_zext_a(i32 noundef %0, ptr addrspace(1) nocapture noundef readnone %1) {
+; GFX900-LABEL: select_i32_zext_zext_a:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dword s0, s[2:3], 0x24
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX900-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX900-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    s_mov_b32 s0, s0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_endpgm
+  %3 = icmp eq i32 %0, 0
+  %4 = select i1 %3, i32 0, i32 32
+  %5 = tail call i32 asm sideeffect "s_mov_b32 $0, $1", "=s,s"(i32 %4), !srcloc !7
+  ret void
+}
+
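+; With a non-power-of-two arm (33), the zext is instead expected to fold
+; into the select immediates, yielding a single s_cselect_b32.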
+define amdgpu_kernel void @select_i32_zext_zext_b(i32 noundef %0, ptr addrspace(1) nocapture noundef readnone %1) {
+; GFX900-LABEL: select_i32_zext_zext_b:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dword s0, s[2:3], 0x24
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX900-NEXT:    s_cselect_b32 s0, 0, 33
+; GFX900-NEXT:    ;;#ASMSTART
+; GFX900-NEXT:    s_mov_b32 s0, s0
+; GFX900-NEXT:    ;;#ASMEND
+; GFX900-NEXT:    s_endpgm
+  %3 = icmp eq i32 %0, 0
+  %4 = select i1 %3, i32 0, i32 33
+  %5 = tail call i32 asm sideeffect "s_mov_b32 $0, $1", "=s,s"(i32 %4), !srcloc !7
+  ret void
+}
+
+
+!7 = !{i64 173}
diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index 5b9866a3c91571..16118b1f208add 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -47,7 +47,7 @@ declare float @llvm.amdgcn.rcp.f32(float)
 ; GCN-LABEL: {{^}}undef_v6f32:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v6f32(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -66,7 +66,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v6i32:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v6i32(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -86,7 +86,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v5f32:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v5f32(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -105,7 +105,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v5i32:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v5i32(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -125,7 +125,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v3f64:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v3f64(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -144,7 +144,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v3i64:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v3i64(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -164,7 +164,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v4f16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v4f16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -183,7 +183,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v4i16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v4i16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -203,7 +203,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v2f16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v2f16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -222,7 +222,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v2i16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v2i16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -262,7 +262,7 @@ define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
 ; GCN-LABEL: {{^}}undef_bf16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_bf16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -284,7 +284,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v2bf16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v2bf16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -306,7 +306,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v3bf16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -328,7 +328,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v4bf16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v4bf16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -350,7 +350,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v6bf16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v6bf16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -372,7 +372,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v8bf16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v8bf16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -394,7 +394,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v16bf16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v16bf16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
@@ -416,7 +416,7 @@ ret:
 ; GCN-LABEL: {{^}}undef_v32bf16:
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 define amdgpu_kernel void @undef_v32bf16(ptr addrspace(3) %ptr, i1 %cond) {
 entry:
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
index b477a72d3810ea..29eea6f06daf40 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -5,11 +5,9 @@
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0:
 ; GCN-NOT: v_cmp
 ; GCN: s_cmp_lg_u32
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN-NEXT:buffer_store_byte [[RESULT]]
-; GCN-NEXT: s_endpgm
-
+; GCN: s_cselect_b32
+; GCN: buffer_store_byte v0, off, s[4:7], 0
+; GCN: s_endpgm
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
 ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
 define amdgpu_kernel void @sext_bool_icmp_eq_0(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
@@ -23,9 +21,8 @@ define amdgpu_kernel void @sext_bool_icmp_eq_0(ptr addrspace(1) %out, i32 %a, i3
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0:
 ; GCN-NOT: v_cmp
 ; GCN: s_cmp_lg_u32
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_cselect_b32
+; GCN: buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT: s_endpgm
 
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
@@ -41,9 +38,8 @@ define amdgpu_kernel void @sext_bool_icmp_ne_0(ptr addrspace(1) %out, i32 %a, i3
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_neg1:
 ; GCN-NOT: v_cmp
 ; GCN: s_cmp_eq_u32
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_cselect_b32
+; GCN: buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @sext_bool_icmp_eq_neg1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
@@ -56,9 +52,8 @@ define amdgpu_kernel void @sext_bool_icmp_eq_neg1(ptr addrspace(1) %out, i32 %a,
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_neg1:
 ; GCN-NOT: v_cmp
 ; GCN: s_cmp_eq_u32
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_cselect_b32
+; GCN: buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @sext_bool_icmp_ne_neg1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
@@ -71,9 +66,8 @@ define amdgpu_kernel void @sext_bool_icmp_ne_neg1(ptr addrspace(1) %out, i32 %a,
 ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0:
 ; GCN-NOT: v_cmp
 ; GCN: s_cmp_lg_u32
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_cselect_b32
+; GCN: buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @zext_bool_icmp_eq_0(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
@@ -86,9 +80,8 @@ define amdgpu_kernel void @zext_bool_icmp_eq_0(ptr addrspace(1) %out, i32 %a, i3
 ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0:
 ; GCN-NOT: v_cmp
 ; GCN: s_cmp_lg_u32
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN-NEXT: buffer_store_byte [[RESULT]]
+; GCN: s_cselect_b32
+; GCN: buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @zext_bool_icmp_ne_0(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
@@ -101,10 +94,6 @@ define amdgpu_kernel void @zext_bool_icmp_ne_0(ptr addrspace(1) %out, i32 %a, i3
 ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1:
 ; GCN-NOT: v_cmp
 ; GCN: s_cmp_eq_u32
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN-NEXT: buffer_store_byte [[RESULT]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @zext_bool_icmp_eq_1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
@@ -116,9 +105,6 @@ define amdgpu_kernel void @zext_bool_icmp_eq_1(ptr addrspace(1) %out, i32 %a, i3
 ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1:
 ; GCN-NOT: v_cmp
 ; GCN: s_cmp_eq_u32
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN-NEXT: buffer_store_byte [[RESULT]]
 define amdgpu_kernel void @zext_bool_icmp_ne_1(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
@@ -157,11 +143,9 @@ define amdgpu_kernel void @zext_bool_icmp_ne_neg1(ptr addrspace(1) %out, i32 %a,
 ; GCN: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff
 ; GCN: s_cmpk_lg_i32 [[B]], 0xff
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN: buffer_store_byte [[RESULT]]
-; GCN: s_endpgm
+; GCN: s_cselect_b32
+; GCN: buffer_store_byte v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @cmp_zext_k_i8max(ptr addrspace(1) %out, i8 %b) nounwind {
   %b.ext = zext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, 255
@@ -203,10 +187,10 @@ define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind {
 ; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], 0xff
 ; GCN: s_cmpk_lg_i32 [[B]], 0xff{{$}}
-; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN: buffer_store_byte [[RESULT]]
-; GCN: s_endpgm
+
+; GCN: s_cselect_b32
+; GCN: buffer_store_byte v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(ptr addrspace(1) %out, i8 %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll
index 438d8d22947a45..b53e7a528c3bc6 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc64.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll
@@ -263,8 +263,8 @@ entry:
 ; GCN-LABEL: {{^}}i128_sle:
 ; GCN: v_cmp_le_i64
 ; CGV: v_cndmask
-; SI: v_cmp_eq_u64
-; VI: s_cmp_eq_u64
+; SI: v_cmp_le_u64
+; VI: v_cmp_le_u64
 define amdgpu_kernel void @i128_sle(ptr addrspace(1) %out, i128 %a, i128 %b) #0 {
 entry:
   %tmp0 = icmp sle i128 %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 0630cca7c099b8..0b03ae44e6d64a 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -15,13 +15,22 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(ptr addrspace(1) %out, i32 %a, i
 ; SI-NEXT:    s_load_dword s0, s[2:3], 0xf
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    s_cbranch_scc0 .LBB0_4
+; SI-NEXT:    s_cbranch_scc0 .LBB0_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_add_i32 s7, s7, s0
-; SI-NEXT:    s_cbranch_execnz .LBB0_3
-; SI-NEXT:  .LBB0_2: ; %if
+; SI-NEXT:    s_mov_b64 s[0:1], 0
+; SI-NEXT:    s_branch .LBB0_3
+; SI-NEXT:  .LBB0_2:
+; SI-NEXT:    s_mov_b64 s[0:1], -1
+; SI-NEXT:    ; implicit-def: $sgpr7
+; SI-NEXT:  .LBB0_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s0, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB0_5
+; SI-NEXT:  ; %bb.4: ; %if
 ; SI-NEXT:    s_sub_i32 s7, s5, s6
-; SI-NEXT:  .LBB0_3: ; %endif
+; SI-NEXT:  .LBB0_5: ; %endif
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_add_i32 s4, s7, s4
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
@@ -30,9 +39,6 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(ptr addrspace(1) %out, i32 %a, i
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-; SI-NEXT:  .LBB0_4:
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    s_branch .LBB0_2
 
 entry:
   %0 = icmp eq i32 %a, 0
@@ -59,19 +65,28 @@ define amdgpu_kernel void @sgpr_if_else_salu_br_opt(ptr addrspace(1) %out, [8 x
 ; SI-NEXT:    s_load_dword s4, s[2:3], 0x13
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    s_cbranch_scc0 .LBB1_4
+; SI-NEXT:    s_cbranch_scc0 .LBB1_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_load_dword s0, s[2:3], 0x2e
 ; SI-NEXT:    s_load_dword s1, s[2:3], 0x37
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_add_i32 s5, s0, s1
-; SI-NEXT:    s_cbranch_execnz .LBB1_3
-; SI-NEXT:  .LBB1_2: ; %if
+; SI-NEXT:    s_mov_b64 s[0:1], 0
+; SI-NEXT:    s_branch .LBB1_3
+; SI-NEXT:  .LBB1_2:
+; SI-NEXT:    s_mov_b64 s[0:1], -1
+; SI-NEXT:    ; implicit-def: $sgpr5
+; SI-NEXT:  .LBB1_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s0, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB1_5
+; SI-NEXT:  ; %bb.4: ; %if
 ; SI-NEXT:    s_load_dword s0, s[2:3], 0x1c
 ; SI-NEXT:    s_load_dword s1, s[2:3], 0x25
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_add_i32 s5, s0, s1
-; SI-NEXT:  .LBB1_3: ; %endif
+; SI-NEXT:  .LBB1_5: ; %endif
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; SI-NEXT:    s_add_i32 s4, s5, s4
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
@@ -80,9 +95,6 @@ define amdgpu_kernel void @sgpr_if_else_salu_br_opt(ptr addrspace(1) %out, [8 x
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-; SI-NEXT:  .LBB1_4:
-; SI-NEXT:    ; implicit-def: $sgpr5
-; SI-NEXT:    s_branch .LBB1_2
 
 entry:
   %cmp0 = icmp eq i32 %a, 0
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
index a3cb3cfba55522..638957ec13a378 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -210,9 +210,8 @@ ENDIF:                                            ; preds = %LOOP
 ; CHECK: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 7
 ; CHECK: s_cbranch
 
-; CHECK: BB{{[0-9]+_[0-9]+}}:
-; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]], 11
-; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]], 13
+; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_LO:[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 v[[SAMPLE_HI:[0-9]+]]
 
 ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v[[[SAMPLE_LO]]:[[SAMPLE_HI]]]
 ; CHECK: exp
@@ -309,13 +308,13 @@ ENDIF69:                                          ; preds = %LOOP68
 ; CHECK: s_cmp_eq_u32
 ; CHECK: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]]
 
-; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
-; CHECK: s_endpgm
-
 ; [[END]]:
 ; CHECK: v_add_{{[iu]}}32_e32 v[[ADD:[0-9]+]], vcc, 1, v{{[0-9]+}}
+; CHECK: s_and_b64 exec, exec, s[16:17]
 ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]]
-; CHECK: s_branch
+
+; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
+
 define amdgpu_ps void @sample_rsrc(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 bb:
   %tmp22 = load <4 x i32>, ptr addrspace(4) %arg1, !tbaa !3
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
index f232275c998d23..7fd2bf89d289b9 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -lowerswitch -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow -verify -S %s -o - | FileCheck -check-prefix=IR %s
 
@@ -107,91 +106,94 @@ define void @my_func(i32 %0) {
 ;
 ; GCN-LABEL: my_func:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    s_load_dword s10, s[4:5], 0x0
-; GCN-NEXT:    s_mov_b64 s[8:9], -1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lt_i32 s10, 1
-; GCN-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-NEXT:    s_cbranch_scc1 .LBB0_7
-; GCN-NEXT:  ; %bb.1: ; %LeafBlock1
-; GCN-NEXT:    s_cmp_lg_u32 s10, 1
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_mov_b64 vcc, exec
-; GCN-NEXT:    s_cbranch_execz .LBB0_8
-; GCN-NEXT:  .LBB0_2: ; %Flow11
-; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
-; GCN-NEXT:    s_cbranch_execz .LBB0_4
-; GCN-NEXT:  .LBB0_3: ; %do.body
-; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], exec
-; GCN-NEXT:  .LBB0_4: ; %Flow17
-; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB0_6
-; GCN-NEXT:  ; %bb.5: ; %UnifiedUnreachableBlock
-; GCN-NEXT:    ; divergent unreachable
-; GCN-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
-; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB0_7: ; %Flow
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-NEXT:    s_cbranch_vccnz .LBB0_2
-; GCN-NEXT:  .LBB0_8: ; %LeafBlock
-; GCN-NEXT:    s_cmp_eq_u32 s10, 0
-; GCN-NEXT:    s_cbranch_scc1 .LBB0_10
-; GCN-NEXT:  ; %bb.9:
-; GCN-NEXT:    s_mov_b64 s[6:7], -1
-; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
-; GCN-NEXT:    s_cbranch_execnz .LBB0_3
-; GCN-NEXT:    s_branch .LBB0_4
-; GCN-NEXT:  .LBB0_10: ; %NodeBlock7
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v0
-; GCN-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB0_12
-; GCN-NEXT:  ; %bb.11: ; %LeafBlock5
-; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
-; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
-; GCN-NEXT:  .LBB0_12: ; %Flow13
-; GCN-NEXT:    s_andn2_saveexec_b64 s[10:11], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB0_14
-; GCN-NEXT:  ; %bb.13: ; %LeafBlock3
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
-; GCN-NEXT:    s_andn2_b64 s[8:9], s[8:9], exec
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GCN-NEXT:    s_and_b64 s[12:13], vcc, exec
-; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[4:5]
-; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
-; GCN-NEXT:  .LBB0_14: ; %Flow14
-; GCN-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    s_and_saveexec_b64 s[10:11], s[8:9]
-; GCN-NEXT:    s_cbranch_execz .LBB0_18
-; GCN-NEXT:  ; %bb.15: ; %LeafBlock9
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v0
-; GCN-NEXT:    s_mov_b64 s[8:9], -1
-; GCN-NEXT:    s_and_saveexec_b64 s[12:13], vcc
-; GCN-NEXT:    s_cbranch_execz .LBB0_17
-; GCN-NEXT:  ; %bb.16: ; %do.body.i.i.i.i
-; GCN-NEXT:    s_mov_b64 s[4:5], exec
-; GCN-NEXT:    s_xor_b64 s[8:9], exec, -1
-; GCN-NEXT:  .LBB0_17: ; %Flow16
-; GCN-NEXT:    s_or_b64 exec, exec, s[12:13]
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
-; GCN-NEXT:    s_and_b64 s[8:9], s[8:9], exec
-; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GCN-NEXT:  .LBB0_18: ; %Flow15
-; GCN-NEXT:    s_or_b64 exec, exec, s[10:11]
-; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
-; GCN-NEXT:    s_cbranch_execnz .LBB0_3
-; GCN-NEXT:    s_branch .LBB0_4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    s_load_dword s10, s[4:5], 0x0
+; GCN-NEXT:    s_mov_b64 s[8:9], -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lt_i32 s10, 1
+; GCN-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
+; GCN-NEXT:  ; %bb.1: ; %LeafBlock1
+; GCN-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-NEXT:    s_cmp_lg_u32 s10, 1
+; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT:  .LBB0_2: ; %Flow
+; GCN-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GCN-NEXT:    s_cselect_b32 s8, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s8, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_13
+; GCN-NEXT:  ; %bb.3: ; %LeafBlock
+; GCN-NEXT:    s_cmp_eq_u32 s10, 0
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_5
+; GCN-NEXT:  ; %bb.4:
+; GCN-NEXT:    s_mov_b64 s[6:7], -1
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; GCN-NEXT:    s_cbranch_execnz .LBB0_14
+; GCN-NEXT:    s_branch .LBB0_15
+; GCN-NEXT:  .LBB0_5: ; %NodeBlock7
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v0
+; GCN-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB0_7
+; GCN-NEXT:  ; %bb.6: ; %LeafBlock5
+; GCN-NEXT:    s_mov_b64 s[6:7], exec
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    s_and_b64 s[8:9], vcc, exec
+; GCN-NEXT:  .LBB0_7: ; %Flow13
+; GCN-NEXT:    s_andn2_saveexec_b64 s[10:11], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB0_9
+; GCN-NEXT:  ; %bb.8: ; %LeafBlock3
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_andn2_b64 s[8:9], s[8:9], exec
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_and_b64 s[12:13], vcc, exec
+; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[4:5]
+; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GCN-NEXT:  .LBB0_9: ; %Flow14
+; GCN-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    s_and_saveexec_b64 s[10:11], s[8:9]
+; GCN-NEXT:    s_cbranch_execz .LBB0_18
+; GCN-NEXT:  ; %bb.10: ; %LeafBlock9
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v0
+; GCN-NEXT:    s_mov_b64 s[8:9], -1
+; GCN-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GCN-NEXT:    s_cbranch_execz .LBB0_12
+; GCN-NEXT:  ; %bb.11: ; %do.body.i.i.i.i
+; GCN-NEXT:    s_mov_b64 s[4:5], exec
+; GCN-NEXT:    s_xor_b64 s[8:9], exec, -1
+; GCN-NEXT:  .LBB0_12: ; %Flow16
+; GCN-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GCN-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT:  .LBB0_13: ; %Flow11
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; GCN-NEXT:    s_cbranch_execz .LBB0_15
+; GCN-NEXT:  .LBB0_14: ; %do.body
+; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:  .LBB0_15: ; %Flow17
+; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB0_17
+; GCN-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
+; GCN-NEXT:    ; divergent unreachable
+; GCN-NEXT:  .LBB0_17: ; %UnifiedReturnBlock
+; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:  .LBB0_18: ; %Flow15
+; GCN-NEXT:    s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; GCN-NEXT:    s_cbranch_execnz .LBB0_14
+; GCN-NEXT:    s_branch .LBB0_15
+
 entry:
   %1 = load i32, ptr addrspace(4) null, align 8
   switch i32 %1, label %do.body [
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 3446e0384cc545..3644b6623c08d3 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -861,22 +861,22 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
 ; GFX9-NEXT:    global_load_dword v2, v0, s[10:11] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_cmp_gt_i32_sdwa vcc, sext(v1), sext(v2) src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX9-NEXT:    v_cmp_gt_i32_sdwa s[0:1], sext(v1), sext(v2) src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v2, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v4, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_lshl_or_b32 v4, v6, 16, v4
+; GFX9-NEXT:    v_lshl_or_b32 v5, v6, 16, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX9-NEXT:    global_store_dword v0, v4, s[4:5]
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX9-NEXT:    global_store_dword v0, v5, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -899,42 +899,39 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_readfirstlane_b32 s0, v4
 ; VI-NEXT:    v_readfirstlane_b32 s1, v5
-; VI-NEXT:    s_ashr_i32 s3, s0, 16
-; VI-NEXT:    s_ashr_i32 s5, s1, 16
-; VI-NEXT:    s_cmp_gt_i32 s3, s5
 ; VI-NEXT:    s_sext_i32_i16 s2, s0
-; VI-NEXT:    s_sext_i32_i16 s4, s1
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT:    s_cselect_b32 s0, s3, s5
-; VI-NEXT:    s_cselect_b32 s3, s5, s3
-; VI-NEXT:    s_lshl_b32 s5, s0, 16
-; VI-NEXT:    s_cmp_gt_i32 s2, s4
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT:    s_cselect_b32 s0, s2, s4
-; VI-NEXT:    s_cselect_b32 s1, s4, s2
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
-; VI-NEXT:    s_lshl_b32 s2, s3, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s5
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
-; VI-NEXT:    s_or_b32 s1, s1, s2
+; VI-NEXT:    s_ashr_i32 s0, s0, 16
+; VI-NEXT:    s_sext_i32_i16 s3, s1
+; VI-NEXT:    s_ashr_i32 s1, s1, 16
+; VI-NEXT:    s_cmp_gt_i32 s0, s1
+; VI-NEXT:    s_cselect_b32 s4, s0, s1
+; VI-NEXT:    s_cselect_b32 s0, s1, s0
+; VI-NEXT:    s_cselect_b32 s1, 1, 0
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_cmp_gt_i32 s2, s3
+; VI-NEXT:    s_cselect_b32 s5, s2, s3
+; VI-NEXT:    s_cselect_b32 s2, s3, s2
+; VI-NEXT:    s_cselect_b32 s3, 1, 0
+; VI-NEXT:    s_lshl_b32 s1, s1, 1
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_or_b32 s1, s3, s1
+; VI-NEXT:    s_or_b32 s3, s5, s4
+; VI-NEXT:    s_or_b32 s0, s2, s0
+; VI-NEXT:    s_and_b32 s1, s1, 3
+; VI-NEXT:    v_mov_b32_e32 v4, s3
 ; VI-NEXT:    v_mov_b32_e32 v5, s0
-; VI-NEXT:    v_and_b32_e32 v4, 3, v4
 ; VI-NEXT:    v_mov_b32_e32 v6, s1
-; VI-NEXT:    flat_store_dword v[0:1], v5
+; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_store_dword v[2:3], v6
+; VI-NEXT:    flat_store_dword v[2:3], v5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    flat_store_byte v[0:1], v4
+; VI-NEXT:    flat_store_byte v[0:1], v6
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
 ;
@@ -964,24 +961,24 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
 ; CI-NEXT:    s_mov_b32 s5, s9
 ; CI-NEXT:    s_mov_b32 s12, s10
 ; CI-NEXT:    s_mov_b32 s13, s11
-; CI-NEXT:    v_cmp_gt_i32_e32 vcc, v0, v2
-; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], v1, v3
-; CI-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
-; CI-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
-; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CI-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; CI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; CI-NEXT:    buffer_store_short v5, off, s[4:7], 0 offset:2
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], v0, v2
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, v1, v3
+; CI-NEXT:    v_cndmask_b32_e32 v4, v3, v1, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v5, v2, v0, s[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; CI-NEXT:    buffer_store_short v4, off, s[4:7], 0 offset:2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    buffer_store_short v4, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v5, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 1, v3
 ; CI-NEXT:    buffer_store_short v1, off, s[12:15], 0 offset:2
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v0, off, s[12:15], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_lshlrev_b32_e32 v0, 1, v2
-; CI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CI-NEXT:    v_or_b32_e32 v0, v1, v0
+; CI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; CI-NEXT:    v_and_b32_e32 v0, 3, v0
 ; CI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index 4384d1e32cf53f..156dc89c21bc73 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -54,12 +54,11 @@ define void @test() {
 ; CHECK-NEXT:    s_mov_b64 exec, s[10:11]
 ; CHECK-NEXT:    v_readlane_b32 s4, v1, 0
 ; CHECK-NEXT:    v_readlane_b32 s5, v1, 1
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CHECK-NEXT:    s_mov_b32 s4, 1
-; CHECK-NEXT:    ; implicit-def: $sgpr5
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], v0, s4
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB0_1
+; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_mov_b32 s5, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s4, s5
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.6: ; %bb.5
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index 1622f498dce65a..27c59f3b7a6c43 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1503,7 +1503,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    s_or_b64 s[0:1], s[8:9], s[6:7]
 ; GCN-NEXT:    s_mov_b32 s0, 0
 ; GCN-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GCN-NEXT:    s_cbranch_scc0 .LBB8_4
+; GCN-NEXT:    s_cbranch_scc0 .LBB8_2
 ; GCN-NEXT:  ; %bb.1:
 ; GCN-NEXT:    s_ashr_i32 s0, s7, 31
 ; GCN-NEXT:    s_add_u32 s2, s6, s0
@@ -1514,6 +1514,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
 ; GCN-NEXT:    s_sub_u32 s0, 0, s12
 ; GCN-NEXT:    s_subb_u32 s1, 0, s13
+; GCN-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1642,8 +1643,16 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
 ; GCN-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT:    s_cbranch_execnz .LBB8_3
+; GCN-NEXT:    s_branch .LBB8_3
 ; GCN-NEXT:  .LBB8_2:
+; GCN-NEXT:    s_mov_b64 s[10:11], -1
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:  .LBB8_3: ; %Flow
+; GCN-NEXT:    s_and_b64 s[0:1], s[10:11], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB8_5
+; GCN-NEXT:  ; %bb.4:
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GCN-NEXT:    s_sub_i32 s0, 0, s6
 ; GCN-NEXT:    s_mov_b32 s1, 0
@@ -1665,13 +1674,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    s_cselect_b32 s0, s2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:  .LBB8_3:
+; GCN-NEXT:  .LBB8_5:
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GCN-NEXT:    s_endpgm
-; GCN-NEXT:  .LBB8_4:
-; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN-NEXT:    s_branch .LBB8_2
 ;
 ; TAHITI-LABEL: srem_i64:
 ; TAHITI:       ; %bb.0:
@@ -1686,7 +1692,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TAHITI-NEXT:    s_waitcnt vmcnt(0)
 ; TAHITI-NEXT:    v_or_b32_e32 v5, v1, v3
 ; TAHITI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; TAHITI-NEXT:    s_cbranch_vccz .LBB8_4
+; TAHITI-NEXT:    s_cbranch_vccz .LBB8_2
 ; TAHITI-NEXT:  ; %bb.1:
 ; TAHITI-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
 ; TAHITI-NEXT:    v_add_i32_e32 v4, vcc, v2, v5
@@ -1699,6 +1705,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TAHITI-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
 ; TAHITI-NEXT:    v_madmk_f32 v5, v6, 0x4f800000, v5
 ; TAHITI-NEXT:    v_rcp_f32_e32 v5, v5
+; TAHITI-NEXT:    s_mov_b64 s[6:7], 0
 ; TAHITI-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; TAHITI-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; TAHITI-NEXT:    v_trunc_f32_e32 v6, v6
@@ -1805,8 +1812,16 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TAHITI-NEXT:    v_xor_b32_e32 v1, v1, v7
 ; TAHITI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
 ; TAHITI-NEXT:    v_subb_u32_e32 v4, vcc, v1, v7, vcc
-; TAHITI-NEXT:    s_cbranch_execnz .LBB8_3
+; TAHITI-NEXT:    s_branch .LBB8_3
 ; TAHITI-NEXT:  .LBB8_2:
+; TAHITI-NEXT:    s_mov_b64 s[6:7], -1
+; TAHITI-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; TAHITI-NEXT:  .LBB8_3: ; %Flow
+; TAHITI-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TAHITI-NEXT:    s_cselect_b32 s0, 1, 0
+; TAHITI-NEXT:    s_cmp_lg_u32 s0, 1
+; TAHITI-NEXT:    s_cbranch_scc1 .LBB8_5
+; TAHITI-NEXT:  ; %bb.4:
 ; TAHITI-NEXT:    v_cvt_f32_u32_e32 v1, v2
 ; TAHITI-NEXT:    v_sub_i32_e32 v3, vcc, 0, v2
 ; TAHITI-NEXT:    v_mov_b32_e32 v4, 0
@@ -1825,14 +1840,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TAHITI-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
 ; TAHITI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; TAHITI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; TAHITI-NEXT:  .LBB8_3:
+; TAHITI-NEXT:  .LBB8_5:
 ; TAHITI-NEXT:    s_mov_b32 s7, 0xf000
 ; TAHITI-NEXT:    s_mov_b32 s6, -1
 ; TAHITI-NEXT:    buffer_store_dwordx2 v[3:4], off, s[4:7], 0
 ; TAHITI-NEXT:    s_endpgm
-; TAHITI-NEXT:  .LBB8_4:
-; TAHITI-NEXT:    ; implicit-def: $vgpr3_vgpr4
-; TAHITI-NEXT:    s_branch .LBB8_2
 ;
 ; TONGA-LABEL: srem_i64:
 ; TONGA:       ; %bb.0:
@@ -1845,7 +1857,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
 ; TONGA-NEXT:    v_or_b32_e32 v5, v1, v3
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; TONGA-NEXT:    s_cbranch_vccz .LBB8_4
+; TONGA-NEXT:    s_cbranch_vccz .LBB8_2
 ; TONGA-NEXT:  ; %bb.1:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v2, v4
@@ -1858,6 +1870,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TONGA-NEXT:    v_subb_u32_e32 v12, vcc, 0, v10, vcc
 ; TONGA-NEXT:    v_madmk_f32 v3, v4, 0x4f800000, v3
 ; TONGA-NEXT:    v_rcp_f32_e32 v3, v3
+; TONGA-NEXT:    s_mov_b64 s[6:7], 0
 ; TONGA-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; TONGA-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; TONGA-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1952,8 +1965,16 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v7
 ; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
 ; TONGA-NEXT:    v_subb_u32_e32 v4, vcc, v1, v7, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB8_3
+; TONGA-NEXT:    s_branch .LBB8_3
 ; TONGA-NEXT:  .LBB8_2:
+; TONGA-NEXT:    s_mov_b64 s[6:7], -1
+; TONGA-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; TONGA-NEXT:  .LBB8_3: ; %Flow
+; TONGA-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TONGA-NEXT:    s_cselect_b32 s0, 1, 0
+; TONGA-NEXT:    s_cmp_lg_u32 s0, 1
+; TONGA-NEXT:    s_cbranch_scc1 .LBB8_5
+; TONGA-NEXT:  ; %bb.4:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v1, v2
 ; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, 0, v2
 ; TONGA-NEXT:    v_mov_b32_e32 v4, 0
@@ -1972,14 +1993,11 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v0, v2
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; TONGA-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; TONGA-NEXT:  .LBB8_3:
+; TONGA-NEXT:  .LBB8_5:
 ; TONGA-NEXT:    v_mov_b32_e32 v0, s4
 ; TONGA-NEXT:    v_mov_b32_e32 v1, s5
 ; TONGA-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
 ; TONGA-NEXT:    s_endpgm
-; TONGA-NEXT:  .LBB8_4:
-; TONGA-NEXT:    ; implicit-def: $vgpr3_vgpr4
-; TONGA-NEXT:    s_branch .LBB8_2
 ;
 ; EG-LABEL: srem_i64:
 ; EG:       ; %bb.0:
@@ -2702,7 +2720,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v7
 ; GCN-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v6
-; GCN-NEXT:    s_cbranch_scc0 .LBB10_7
+; GCN-NEXT:    s_cbranch_scc0 .LBB10_2
 ; GCN-NEXT:  ; %bb.1:
 ; GCN-NEXT:    s_ashr_i32 s0, s11, 31
 ; GCN-NEXT:    s_add_u32 s2, s10, s0
@@ -2713,6 +2731,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s17
 ; GCN-NEXT:    s_sub_u32 s0, 0, s16
 ; GCN-NEXT:    s_subb_u32 s1, 0, s17
+; GCN-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2841,8 +2860,16 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_mov_b32_e32 v2, s18
 ; GCN-NEXT:    v_subrev_co_u32_e32 v0, vcc, s18, v0
 ; GCN-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT:    s_cbranch_execnz .LBB10_3
+; GCN-NEXT:    s_branch .LBB10_3
 ; GCN-NEXT:  .LBB10_2:
+; GCN-NEXT:    s_mov_b64 s[14:15], -1
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT:  .LBB10_3: ; %Flow5
+; GCN-NEXT:    s_and_b64 s[0:1], s[14:15], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB10_5
+; GCN-NEXT:  ; %bb.4:
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GCN-NEXT:    s_sub_i32 s0, 0, s10
 ; GCN-NEXT:    s_mov_b32 s1, 0
@@ -2864,12 +2891,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_cselect_b32 s0, s2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:  .LBB10_3:
+; GCN-NEXT:  .LBB10_5:
 ; GCN-NEXT:    s_or_b64 s[0:1], s[8:9], s[6:7]
 ; GCN-NEXT:    s_mov_b32 s0, 0
 ; GCN-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GCN-NEXT:    s_cbranch_scc0 .LBB10_8
-; GCN-NEXT:  ; %bb.4:
+; GCN-NEXT:    s_cbranch_scc0 .LBB10_7
+; GCN-NEXT:  ; %bb.6:
 ; GCN-NEXT:    s_ashr_i32 s0, s7, 31
 ; GCN-NEXT:    s_add_u32 s2, s6, s0
 ; GCN-NEXT:    s_mov_b32 s1, s0
@@ -2879,6 +2906,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s13
 ; GCN-NEXT:    s_sub_u32 s0, 0, s12
 ; GCN-NEXT:    s_subb_u32 s1, 0, s13
+; GCN-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-NEXT:    v_madmk_f32 v2, v3, 0x4f800000, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -3007,8 +3035,15 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_mov_b32_e32 v4, s14
 ; GCN-NEXT:    v_subrev_co_u32_e32 v2, vcc, s14, v2
 ; GCN-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
-; GCN-NEXT:    s_cbranch_execnz .LBB10_6
-; GCN-NEXT:  .LBB10_5:
+; GCN-NEXT:    s_branch .LBB10_8
+; GCN-NEXT:  .LBB10_7:
+; GCN-NEXT:    s_mov_b64 s[10:11], -1
+; GCN-NEXT:  .LBB10_8: ; %Flow
+; GCN-NEXT:    s_and_b64 s[0:1], s[10:11], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB10_10
+; GCN-NEXT:  ; %bb.9:
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s6
 ; GCN-NEXT:    s_sub_i32 s0, 0, s6
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
@@ -3027,15 +3062,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:  .LBB10_6:
+; GCN-NEXT:  .LBB10_10:
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[4:5]
 ; GCN-NEXT:    s_endpgm
-; GCN-NEXT:  .LBB10_7:
-; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN-NEXT:    s_branch .LBB10_2
-; GCN-NEXT:  .LBB10_8:
-; GCN-NEXT:    s_branch .LBB10_5
 ;
 ; TAHITI-LABEL: srem_v2i64:
 ; TAHITI:       ; %bb.0:
@@ -3051,7 +3081,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    s_waitcnt vmcnt(0)
 ; TAHITI-NEXT:    v_or_b32_e32 v9, v5, v1
 ; TAHITI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; TAHITI-NEXT:    s_cbranch_vccz .LBB10_7
+; TAHITI-NEXT:    s_cbranch_vccz .LBB10_2
 ; TAHITI-NEXT:  ; %bb.1:
 ; TAHITI-NEXT:    v_ashrrev_i32_e32 v9, 31, v1
 ; TAHITI-NEXT:    v_add_i32_e32 v8, vcc, v0, v9
@@ -3064,6 +3094,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_subb_u32_e32 v12, vcc, 0, v1, vcc
 ; TAHITI-NEXT:    v_madmk_f32 v9, v10, 0x4f800000, v9
 ; TAHITI-NEXT:    v_rcp_f32_e32 v9, v9
+; TAHITI-NEXT:    s_mov_b64 s[6:7], 0
 ; TAHITI-NEXT:    v_mul_f32_e32 v9, 0x5f7ffffc, v9
 ; TAHITI-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v9
 ; TAHITI-NEXT:    v_trunc_f32_e32 v10, v10
@@ -3170,8 +3201,16 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_xor_b32_e32 v1, v1, v11
 ; TAHITI-NEXT:    v_sub_i32_e32 v8, vcc, v5, v11
 ; TAHITI-NEXT:    v_subb_u32_e32 v9, vcc, v1, v11, vcc
-; TAHITI-NEXT:    s_cbranch_execnz .LBB10_3
+; TAHITI-NEXT:    s_branch .LBB10_3
 ; TAHITI-NEXT:  .LBB10_2:
+; TAHITI-NEXT:    s_mov_b64 s[6:7], -1
+; TAHITI-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; TAHITI-NEXT:  .LBB10_3: ; %Flow5
+; TAHITI-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TAHITI-NEXT:    s_cselect_b32 s0, 1, 0
+; TAHITI-NEXT:    s_cmp_lg_u32 s0, 1
+; TAHITI-NEXT:    s_cbranch_scc1 .LBB10_5
+; TAHITI-NEXT:  ; %bb.4:
 ; TAHITI-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; TAHITI-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
 ; TAHITI-NEXT:    v_mov_b32_e32 v9, 0
@@ -3190,12 +3229,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_sub_i32_e32 v4, vcc, v1, v0
 ; TAHITI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
 ; TAHITI-NEXT:    v_cndmask_b32_e32 v8, v1, v4, vcc
-; TAHITI-NEXT:  .LBB10_3:
+; TAHITI-NEXT:  .LBB10_5:
 ; TAHITI-NEXT:    v_or_b32_e32 v1, v7, v3
 ; TAHITI-NEXT:    v_mov_b32_e32 v0, 0
 ; TAHITI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; TAHITI-NEXT:    s_cbranch_vccz .LBB10_8
-; TAHITI-NEXT:  ; %bb.4:
+; TAHITI-NEXT:    s_cbranch_vccz .LBB10_7
+; TAHITI-NEXT:  ; %bb.6:
 ; TAHITI-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; TAHITI-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
 ; TAHITI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
@@ -3207,6 +3246,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_subb_u32_e32 v10, vcc, 0, v0, vcc
 ; TAHITI-NEXT:    v_madmk_f32 v3, v4, 0x4f800000, v3
 ; TAHITI-NEXT:    v_rcp_f32_e32 v3, v3
+; TAHITI-NEXT:    s_mov_b64 s[6:7], 0
 ; TAHITI-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; TAHITI-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; TAHITI-NEXT:    v_trunc_f32_e32 v4, v4
@@ -3313,8 +3353,15 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; TAHITI-NEXT:    v_sub_i32_e32 v10, vcc, v1, v5
 ; TAHITI-NEXT:    v_subb_u32_e32 v11, vcc, v0, v5, vcc
-; TAHITI-NEXT:    s_cbranch_execnz .LBB10_6
-; TAHITI-NEXT:  .LBB10_5:
+; TAHITI-NEXT:    s_branch .LBB10_8
+; TAHITI-NEXT:  .LBB10_7:
+; TAHITI-NEXT:    s_mov_b64 s[6:7], -1
+; TAHITI-NEXT:  .LBB10_8: ; %Flow
+; TAHITI-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TAHITI-NEXT:    s_cselect_b32 s0, 1, 0
+; TAHITI-NEXT:    s_cmp_lg_u32 s0, 1
+; TAHITI-NEXT:    s_cbranch_scc1 .LBB10_10
+; TAHITI-NEXT:  ; %bb.9:
 ; TAHITI-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; TAHITI-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; TAHITI-NEXT:    v_mov_b32_e32 v11, 0
@@ -3333,16 +3380,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_subrev_i32_e32 v1, vcc, v2, v0
 ; TAHITI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; TAHITI-NEXT:    v_cndmask_b32_e32 v10, v0, v1, vcc
-; TAHITI-NEXT:  .LBB10_6:
+; TAHITI-NEXT:  .LBB10_10:
 ; TAHITI-NEXT:    s_mov_b32 s7, 0xf000
 ; TAHITI-NEXT:    s_mov_b32 s6, -1
 ; TAHITI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; TAHITI-NEXT:    s_endpgm
-; TAHITI-NEXT:  .LBB10_7:
-; TAHITI-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; TAHITI-NEXT:    s_branch .LBB10_2
-; TAHITI-NEXT:  .LBB10_8:
-; TAHITI-NEXT:    s_branch .LBB10_5
 ;
 ; TONGA-LABEL: srem_v2i64:
 ; TONGA:       ; %bb.0:
@@ -3360,7 +3402,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
 ; TONGA-NEXT:    v_or_b32_e32 v9, v5, v1
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; TONGA-NEXT:    s_cbranch_vccz .LBB10_7
+; TONGA-NEXT:    s_cbranch_vccz .LBB10_2
 ; TONGA-NEXT:  ; %bb.1:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
 ; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v0, v8
@@ -3373,6 +3415,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subb_u32_e32 v16, vcc, 0, v1, vcc
 ; TONGA-NEXT:    v_madmk_f32 v8, v9, 0x4f800000, v8
 ; TONGA-NEXT:    v_rcp_f32_e32 v8, v8
+; TONGA-NEXT:    s_mov_b64 s[6:7], 0
 ; TONGA-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
 ; TONGA-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
 ; TONGA-NEXT:    v_trunc_f32_e32 v9, v9
@@ -3467,8 +3510,16 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v12
 ; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v5, v12
 ; TONGA-NEXT:    v_subb_u32_e32 v9, vcc, v1, v12, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB10_3
+; TONGA-NEXT:    s_branch .LBB10_3
 ; TONGA-NEXT:  .LBB10_2:
+; TONGA-NEXT:    s_mov_b64 s[6:7], -1
+; TONGA-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; TONGA-NEXT:  .LBB10_3: ; %Flow5
+; TONGA-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TONGA-NEXT:    s_cselect_b32 s0, 1, 0
+; TONGA-NEXT:    s_cmp_lg_u32 s0, 1
+; TONGA-NEXT:    s_cbranch_scc1 .LBB10_5
+; TONGA-NEXT:  ; %bb.4:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
 ; TONGA-NEXT:    v_mov_b32_e32 v9, 0
@@ -3487,12 +3538,12 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, v1, v0
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
 ; TONGA-NEXT:    v_cndmask_b32_e32 v8, v1, v4, vcc
-; TONGA-NEXT:  .LBB10_3:
+; TONGA-NEXT:  .LBB10_5:
 ; TONGA-NEXT:    v_or_b32_e32 v1, v7, v3
 ; TONGA-NEXT:    v_mov_b32_e32 v0, 0
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; TONGA-NEXT:    s_cbranch_vccz .LBB10_8
-; TONGA-NEXT:  ; %bb.4:
+; TONGA-NEXT:    s_cbranch_vccz .LBB10_7
+; TONGA-NEXT:  ; %bb.6:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v2, v0
 ; TONGA-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
@@ -3504,6 +3555,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subb_u32_e32 v14, vcc, 0, v12, vcc
 ; TONGA-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; TONGA-NEXT:    v_rcp_f32_e32 v0, v0
+; TONGA-NEXT:    s_mov_b64 s[6:7], 0
 ; TONGA-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; TONGA-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; TONGA-NEXT:    v_trunc_f32_e32 v1, v1
@@ -3598,8 +3650,15 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v11
 ; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, v0, v11
 ; TONGA-NEXT:    v_subb_u32_e32 v11, vcc, v1, v11, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB10_6
-; TONGA-NEXT:  .LBB10_5:
+; TONGA-NEXT:    s_branch .LBB10_8
+; TONGA-NEXT:  .LBB10_7:
+; TONGA-NEXT:    s_mov_b64 s[6:7], -1
+; TONGA-NEXT:  .LBB10_8: ; %Flow
+; TONGA-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TONGA-NEXT:    s_cselect_b32 s0, 1, 0
+; TONGA-NEXT:    s_cmp_lg_u32 s0, 1
+; TONGA-NEXT:    s_cbranch_scc1 .LBB10_10
+; TONGA-NEXT:  ; %bb.9:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, 0, v2
 ; TONGA-NEXT:    v_mov_b32_e32 v11, 0
@@ -3618,16 +3677,11 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v2, v0
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; TONGA-NEXT:    v_cndmask_b32_e32 v10, v0, v1, vcc
-; TONGA-NEXT:  .LBB10_6:
+; TONGA-NEXT:  .LBB10_10:
 ; TONGA-NEXT:    v_mov_b32_e32 v0, s4
 ; TONGA-NEXT:    v_mov_b32_e32 v1, s5
 ; TONGA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; TONGA-NEXT:    s_endpgm
-; TONGA-NEXT:  .LBB10_7:
-; TONGA-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; TONGA-NEXT:    s_branch .LBB10_2
-; TONGA-NEXT:  .LBB10_8:
-; TONGA-NEXT:    s_branch .LBB10_5
 ;
 ; EG-LABEL: srem_v2i64:
 ; EG:       ; %bb.0:
@@ -4876,7 +4930,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_or_b64 s[0:1], s[8:9], s[6:7]
 ; GCN-NEXT:    s_mov_b32 s0, 0
 ; GCN-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GCN-NEXT:    s_cbranch_scc0 .LBB12_13
+; GCN-NEXT:    s_cbranch_scc0 .LBB12_2
 ; GCN-NEXT:  ; %bb.1:
 ; GCN-NEXT:    s_ashr_i32 s0, s7, 31
 ; GCN-NEXT:    s_add_u32 s2, s6, s0
@@ -4887,6 +4941,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v9, s13
 ; GCN-NEXT:    s_sub_u32 s0, 0, s12
 ; GCN-NEXT:    s_subb_u32 s1, 0, s13
+; GCN-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-NEXT:    v_madmk_f32 v8, v9, 0x4f800000, v8
 ; GCN-NEXT:    v_rcp_f32_e32 v8, v8
 ; GCN-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
@@ -5015,8 +5070,16 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_mov_b32_e32 v10, s14
 ; GCN-NEXT:    v_subrev_co_u32_e32 v8, vcc, s14, v8
 ; GCN-NEXT:    v_subb_co_u32_e32 v9, vcc, v9, v10, vcc
-; GCN-NEXT:    s_cbranch_execnz .LBB12_3
+; GCN-NEXT:    s_branch .LBB12_3
 ; GCN-NEXT:  .LBB12_2:
+; GCN-NEXT:    s_mov_b64 s[10:11], -1
+; GCN-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GCN-NEXT:  .LBB12_3: ; %Flow7
+; GCN-NEXT:    s_and_b64 s[0:1], s[10:11], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB12_5
+; GCN-NEXT:  ; %bb.4:
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v8, s6
 ; GCN-NEXT:    s_sub_i32 s0, 0, s6
 ; GCN-NEXT:    s_mov_b32 s1, 0
@@ -5038,12 +5101,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_cselect_b32 s0, s2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v9, s1
 ; GCN-NEXT:    v_mov_b32_e32 v8, s0
-; GCN-NEXT:  .LBB12_3:
+; GCN-NEXT:  .LBB12_5:
 ; GCN-NEXT:    v_or_b32_e32 v11, v17, v13
 ; GCN-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT:    s_cbranch_vccz .LBB12_14
-; GCN-NEXT:  ; %bb.4:
+; GCN-NEXT:    s_cbranch_vccz .LBB12_7
+; GCN-NEXT:  ; %bb.6:
 ; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v13
 ; GCN-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v10
 ; GCN-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v10, vcc
@@ -5055,6 +5118,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_subb_co_u32_e32 v18, vcc, 0, v10, vcc
 ; GCN-NEXT:    v_madmk_f32 v13, v14, 0x4f800000, v13
 ; GCN-NEXT:    v_rcp_f32_e32 v13, v13
+; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    v_mul_f32_e32 v13, 0x5f7ffffc, v13
 ; GCN-NEXT:    v_mul_f32_e32 v14, 0x2f800000, v13
 ; GCN-NEXT:    v_trunc_f32_e32 v14, v14
@@ -5161,8 +5225,15 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_xor_b32_e32 v13, v10, v15
 ; GCN-NEXT:    v_sub_co_u32_e32 v10, vcc, v11, v15
 ; GCN-NEXT:    v_subb_co_u32_e32 v11, vcc, v13, v15, vcc
-; GCN-NEXT:    s_cbranch_execnz .LBB12_6
-; GCN-NEXT:  .LBB12_5:
+; GCN-NEXT:    s_branch .LBB12_8
+; GCN-NEXT:  .LBB12_7:
+; GCN-NEXT:    s_mov_b64 s[6:7], -1
+; GCN-NEXT:  .LBB12_8: ; %Flow6
+; GCN-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB12_10
+; GCN-NEXT:  ; %bb.9:
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v12
 ; GCN-NEXT:    v_sub_u32_e32 v11, 0, v12
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
@@ -5181,13 +5252,13 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v12
 ; GCN-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NEXT:  .LBB12_6:
+; GCN-NEXT:  .LBB12_10:
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_or_b32_e32 v13, v5, v1
 ; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; GCN-NEXT:    s_cbranch_vccz .LBB12_15
-; GCN-NEXT:  ; %bb.7:
+; GCN-NEXT:    s_cbranch_vccz .LBB12_12
+; GCN-NEXT:  ; %bb.11:
 ; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v1
 ; GCN-NEXT:    v_add_co_u32_e32 v12, vcc, v0, v13
 ; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v13, vcc
@@ -5199,6 +5270,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_subb_co_u32_e32 v16, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_madmk_f32 v13, v14, 0x4f800000, v13
 ; GCN-NEXT:    v_rcp_f32_e32 v13, v13
+; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    v_mul_f32_e32 v13, 0x5f7ffffc, v13
 ; GCN-NEXT:    v_mul_f32_e32 v14, 0x2f800000, v13
 ; GCN-NEXT:    v_trunc_f32_e32 v14, v14
@@ -5305,8 +5377,16 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v15
 ; GCN-NEXT:    v_sub_co_u32_e32 v12, vcc, v5, v15
 ; GCN-NEXT:    v_subb_co_u32_e32 v13, vcc, v1, v15, vcc
-; GCN-NEXT:    s_cbranch_execnz .LBB12_9
-; GCN-NEXT:  .LBB12_8:
+; GCN-NEXT:    s_branch .LBB12_13
+; GCN-NEXT:  .LBB12_12:
+; GCN-NEXT:    s_mov_b64 s[6:7], -1
+; GCN-NEXT:    ; implicit-def: $vgpr12_vgpr13
+; GCN-NEXT:  .LBB12_13: ; %Flow5
+; GCN-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB12_15
+; GCN-NEXT:  ; %bb.14:
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-NEXT:    v_sub_u32_e32 v5, 0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v13, 0
@@ -5325,12 +5405,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_sub_u32_e32 v4, v1, v0
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v12, v1, v4, vcc
-; GCN-NEXT:  .LBB12_9:
+; GCN-NEXT:  .LBB12_15:
 ; GCN-NEXT:    v_or_b32_e32 v1, v7, v3
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    s_cbranch_vccz .LBB12_16
-; GCN-NEXT:  ; %bb.10:
+; GCN-NEXT:    s_cbranch_vccz .LBB12_17
+; GCN-NEXT:  ; %bb.16:
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GCN-NEXT:    v_add_co_u32_e32 v1, vcc, v2, v0
 ; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v0, vcc
@@ -5342,6 +5422,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v0, vcc
 ; GCN-NEXT:    v_madmk_f32 v3, v4, 0x4f800000, v3
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
+; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
@@ -5448,8 +5529,15 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; GCN-NEXT:    v_sub_co_u32_e32 v14, vcc, v1, v5
 ; GCN-NEXT:    v_subb_co_u32_e32 v15, vcc, v0, v5, vcc
-; GCN-NEXT:    s_cbranch_execnz .LBB12_12
-; GCN-NEXT:  .LBB12_11:
+; GCN-NEXT:    s_branch .LBB12_18
+; GCN-NEXT:  .LBB12_17:
+; GCN-NEXT:    s_mov_b64 s[6:7], -1
+; GCN-NEXT:  .LBB12_18: ; %Flow
+; GCN-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB12_20
+; GCN-NEXT:  ; %bb.19:
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; GCN-NEXT:    v_sub_u32_e32 v1, 0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v15, 0
@@ -5468,21 +5556,11 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_sub_u32_e32 v1, v0, v2
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v14, v0, v1, vcc
-; GCN-NEXT:  .LBB12_12:
+; GCN-NEXT:  .LBB12_20:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    global_store_dwordx4 v0, v[12:15], s[4:5] offset:16
 ; GCN-NEXT:    global_store_dwordx4 v0, v[8:11], s[4:5]
 ; GCN-NEXT:    s_endpgm
-; GCN-NEXT:  .LBB12_13:
-; GCN-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GCN-NEXT:    s_branch .LBB12_2
-; GCN-NEXT:  .LBB12_14:
-; GCN-NEXT:    s_branch .LBB12_5
-; GCN-NEXT:  .LBB12_15:
-; GCN-NEXT:    ; implicit-def: $vgpr12_vgpr13
-; GCN-NEXT:    s_branch .LBB12_8
-; GCN-NEXT:  .LBB12_16:
-; GCN-NEXT:    s_branch .LBB12_11
 ;
 ; TAHITI-LABEL: srem_v4i64:
 ; TAHITI:       ; %bb.0:
@@ -5500,7 +5578,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    s_waitcnt vmcnt(2)
 ; TAHITI-NEXT:    v_or_b32_e32 v9, v15, v11
 ; TAHITI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; TAHITI-NEXT:    s_cbranch_vccz .LBB12_13
+; TAHITI-NEXT:    s_cbranch_vccz .LBB12_2
 ; TAHITI-NEXT:  ; %bb.1:
 ; TAHITI-NEXT:    v_ashrrev_i32_e32 v8, 31, v11
 ; TAHITI-NEXT:    v_add_i32_e32 v9, vcc, v10, v8
@@ -5513,6 +5591,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_subb_u32_e32 v20, vcc, 0, v8, vcc
 ; TAHITI-NEXT:    v_madmk_f32 v11, v18, 0x4f800000, v11
 ; TAHITI-NEXT:    v_rcp_f32_e32 v11, v11
+; TAHITI-NEXT:    s_mov_b64 s[6:7], 0
 ; TAHITI-NEXT:    v_mul_f32_e32 v11, 0x5f7ffffc, v11
 ; TAHITI-NEXT:    v_mul_f32_e32 v18, 0x2f800000, v11
 ; TAHITI-NEXT:    v_trunc_f32_e32 v18, v18
@@ -5619,8 +5698,16 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_xor_b32_e32 v11, v8, v19
 ; TAHITI-NEXT:    v_sub_i32_e32 v8, vcc, v9, v19
 ; TAHITI-NEXT:    v_subb_u32_e32 v9, vcc, v11, v19, vcc
-; TAHITI-NEXT:    s_cbranch_execnz .LBB12_3
+; TAHITI-NEXT:    s_branch .LBB12_3
 ; TAHITI-NEXT:  .LBB12_2:
+; TAHITI-NEXT:    s_mov_b64 s[6:7], -1
+; TAHITI-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; TAHITI-NEXT:  .LBB12_3: ; %Flow7
+; TAHITI-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TAHITI-NEXT:    s_cselect_b32 s0, 1, 0
+; TAHITI-NEXT:    s_cmp_lg_u32 s0, 1
+; TAHITI-NEXT:    s_cbranch_scc1 .LBB12_5
+; TAHITI-NEXT:  ; %bb.4:
 ; TAHITI-NEXT:    v_cvt_f32_u32_e32 v8, v10
 ; TAHITI-NEXT:    v_sub_i32_e32 v9, vcc, 0, v10
 ; TAHITI-NEXT:    v_rcp_iflag_f32_e32 v8, v8
@@ -5639,12 +5726,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v10
 ; TAHITI-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
 ; TAHITI-NEXT:    v_mov_b32_e32 v9, 0
-; TAHITI-NEXT:  .LBB12_3:
+; TAHITI-NEXT:  .LBB12_5:
 ; TAHITI-NEXT:    v_or_b32_e32 v11, v17, v13
 ; TAHITI-NEXT:    v_mov_b32_e32 v10, 0
 ; TAHITI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; TAHITI-NEXT:    s_cbranch_vccz .LBB12_14
-; TAHITI-NEXT:  ; %bb.4:
+; TAHITI-NEXT:    s_cbranch_vccz .LBB12_7
+; TAHITI-NEXT:  ; %bb.6:
 ; TAHITI-NEXT:    v_ashrrev_i32_e32 v10, 31, v13
 ; TAHITI-NEXT:    v_add_i32_e32 v11, vcc, v12, v10
 ; TAHITI-NEXT:    v_addc_u32_e32 v13, vcc, v13, v10, vcc
@@ -5656,6 +5743,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_subb_u32_e32 v18, vcc, 0, v10, vcc
 ; TAHITI-NEXT:    v_madmk_f32 v13, v14, 0x4f800000, v13
 ; TAHITI-NEXT:    v_rcp_f32_e32 v13, v13
+; TAHITI-NEXT:    s_mov_b64 s[6:7], 0
 ; TAHITI-NEXT:    v_mul_f32_e32 v13, 0x5f7ffffc, v13
 ; TAHITI-NEXT:    v_mul_f32_e32 v14, 0x2f800000, v13
 ; TAHITI-NEXT:    v_trunc_f32_e32 v14, v14
@@ -5762,8 +5850,15 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_xor_b32_e32 v13, v10, v15
 ; TAHITI-NEXT:    v_sub_i32_e32 v10, vcc, v11, v15
 ; TAHITI-NEXT:    v_subb_u32_e32 v11, vcc, v13, v15, vcc
-; TAHITI-NEXT:    s_cbranch_execnz .LBB12_6
-; TAHITI-NEXT:  .LBB12_5:
+; TAHITI-NEXT:    s_branch .LBB12_8
+; TAHITI-NEXT:  .LBB12_7:
+; TAHITI-NEXT:    s_mov_b64 s[6:7], -1
+; TAHITI-NEXT:  .LBB12_8: ; %Flow6
+; TAHITI-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TAHITI-NEXT:    s_cselect_b32 s0, 1, 0
+; TAHITI-NEXT:    s_cmp_lg_u32 s0, 1
+; TAHITI-NEXT:    s_cbranch_scc1 .LBB12_10
+; TAHITI-NEXT:  ; %bb.9:
 ; TAHITI-NEXT:    v_cvt_f32_u32_e32 v10, v12
 ; TAHITI-NEXT:    v_sub_i32_e32 v11, vcc, 0, v12
 ; TAHITI-NEXT:    v_rcp_iflag_f32_e32 v10, v10
@@ -5782,13 +5877,13 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v12
 ; TAHITI-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
 ; TAHITI-NEXT:    v_mov_b32_e32 v11, 0
-; TAHITI-NEXT:  .LBB12_6:
+; TAHITI-NEXT:  .LBB12_10:
 ; TAHITI-NEXT:    s_waitcnt vmcnt(0)
 ; TAHITI-NEXT:    v_or_b32_e32 v13, v5, v1
 ; TAHITI-NEXT:    v_mov_b32_e32 v12, 0
 ; TAHITI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; TAHITI-NEXT:    s_cbranch_vccz .LBB12_15
-; TAHITI-NEXT:  ; %bb.7:
+; TAHITI-NEXT:    s_cbranch_vccz .LBB12_12
+; TAHITI-NEXT:  ; %bb.11:
 ; TAHITI-NEXT:    v_ashrrev_i32_e32 v13, 31, v1
 ; TAHITI-NEXT:    v_add_i32_e32 v12, vcc, v0, v13
 ; TAHITI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v13, vcc
@@ -5800,6 +5895,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_subb_u32_e32 v16, vcc, 0, v1, vcc
 ; TAHITI-NEXT:    v_madmk_f32 v13, v14, 0x4f800000, v13
 ; TAHITI-NEXT:    v_rcp_f32_e32 v13, v13
+; TAHITI-NEXT:    s_mov_b64 s[6:7], 0
 ; TAHITI-NEXT:    v_mul_f32_e32 v13, 0x5f7ffffc, v13
 ; TAHITI-NEXT:    v_mul_f32_e32 v14, 0x2f800000, v13
 ; TAHITI-NEXT:    v_trunc_f32_e32 v14, v14
@@ -5906,8 +6002,16 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_xor_b32_e32 v1, v1, v15
 ; TAHITI-NEXT:    v_sub_i32_e32 v12, vcc, v5, v15
 ; TAHITI-NEXT:    v_subb_u32_e32 v13, vcc, v1, v15, vcc
-; TAHITI-NEXT:    s_cbranch_execnz .LBB12_9
-; TAHITI-NEXT:  .LBB12_8:
+; TAHITI-NEXT:    s_branch .LBB12_13
+; TAHITI-NEXT:  .LBB12_12:
+; TAHITI-NEXT:    s_mov_b64 s[6:7], -1
+; TAHITI-NEXT:    ; implicit-def: $vgpr12_vgpr13
+; TAHITI-NEXT:  .LBB12_13: ; %Flow5
+; TAHITI-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TAHITI-NEXT:    s_cselect_b32 s0, 1, 0
+; TAHITI-NEXT:    s_cmp_lg_u32 s0, 1
+; TAHITI-NEXT:    s_cbranch_scc1 .LBB12_15
+; TAHITI-NEXT:  ; %bb.14:
 ; TAHITI-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; TAHITI-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
 ; TAHITI-NEXT:    v_mov_b32_e32 v13, 0
@@ -5926,12 +6030,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v1
 ; TAHITI-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
 ; TAHITI-NEXT:    v_cndmask_b32_e32 v12, v1, v4, vcc
-; TAHITI-NEXT:  .LBB12_9:
+; TAHITI-NEXT:  .LBB12_15:
 ; TAHITI-NEXT:    v_or_b32_e32 v1, v7, v3
 ; TAHITI-NEXT:    v_mov_b32_e32 v0, 0
 ; TAHITI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; TAHITI-NEXT:    s_cbranch_vccz .LBB12_16
-; TAHITI-NEXT:  ; %bb.10:
+; TAHITI-NEXT:    s_cbranch_vccz .LBB12_17
+; TAHITI-NEXT:  ; %bb.16:
 ; TAHITI-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; TAHITI-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
 ; TAHITI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
@@ -5943,6 +6047,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_subb_u32_e32 v14, vcc, 0, v0, vcc
 ; TAHITI-NEXT:    v_madmk_f32 v3, v4, 0x4f800000, v3
 ; TAHITI-NEXT:    v_rcp_f32_e32 v3, v3
+; TAHITI-NEXT:    s_mov_b64 s[6:7], 0
 ; TAHITI-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; TAHITI-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; TAHITI-NEXT:    v_trunc_f32_e32 v4, v4
@@ -6049,8 +6154,15 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_xor_b32_e32 v0, v0, v5
 ; TAHITI-NEXT:    v_sub_i32_e32 v14, vcc, v1, v5
 ; TAHITI-NEXT:    v_subb_u32_e32 v15, vcc, v0, v5, vcc
-; TAHITI-NEXT:    s_cbranch_execnz .LBB12_12
-; TAHITI-NEXT:  .LBB12_11:
+; TAHITI-NEXT:    s_branch .LBB12_18
+; TAHITI-NEXT:  .LBB12_17:
+; TAHITI-NEXT:    s_mov_b64 s[6:7], -1
+; TAHITI-NEXT:  .LBB12_18: ; %Flow
+; TAHITI-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TAHITI-NEXT:    s_cselect_b32 s0, 1, 0
+; TAHITI-NEXT:    s_cmp_lg_u32 s0, 1
+; TAHITI-NEXT:    s_cbranch_scc1 .LBB12_20
+; TAHITI-NEXT:  ; %bb.19:
 ; TAHITI-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; TAHITI-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; TAHITI-NEXT:    v_mov_b32_e32 v15, 0
@@ -6069,22 +6181,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TAHITI-NEXT:    v_subrev_i32_e32 v1, vcc, v2, v0
 ; TAHITI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; TAHITI-NEXT:    v_cndmask_b32_e32 v14, v0, v1, vcc
-; TAHITI-NEXT:  .LBB12_12:
+; TAHITI-NEXT:  .LBB12_20:
 ; TAHITI-NEXT:    s_mov_b32 s7, 0xf000
 ; TAHITI-NEXT:    s_mov_b32 s6, -1
 ; TAHITI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
 ; TAHITI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; TAHITI-NEXT:    s_endpgm
-; TAHITI-NEXT:  .LBB12_13:
-; TAHITI-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; TAHITI-NEXT:    s_branch .LBB12_2
-; TAHITI-NEXT:  .LBB12_14:
-; TAHITI-NEXT:    s_branch .LBB12_5
-; TAHITI-NEXT:  .LBB12_15:
-; TAHITI-NEXT:    ; implicit-def: $vgpr12_vgpr13
-; TAHITI-NEXT:    s_branch .LBB12_8
-; TAHITI-NEXT:  .LBB12_16:
-; TAHITI-NEXT:    s_branch .LBB12_11
 ;
 ; TONGA-LABEL: srem_v4i64:
 ; TONGA:       ; %bb.0:
@@ -6112,7 +6214,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    s_waitcnt vmcnt(2)
 ; TONGA-NEXT:    v_or_b32_e32 v9, v15, v11
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; TONGA-NEXT:    s_cbranch_vccz .LBB12_13
+; TONGA-NEXT:    s_cbranch_vccz .LBB12_2
 ; TONGA-NEXT:  ; %bb.1:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v11
 ; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v10, v8
@@ -6125,6 +6227,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subb_u32_e32 v24, vcc, 0, v11, vcc
 ; TONGA-NEXT:    v_madmk_f32 v8, v9, 0x4f800000, v8
 ; TONGA-NEXT:    v_rcp_f32_e32 v8, v8
+; TONGA-NEXT:    s_mov_b64 s[6:7], 0
 ; TONGA-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
 ; TONGA-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
 ; TONGA-NEXT:    v_trunc_f32_e32 v9, v9
@@ -6219,8 +6322,16 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_xor_b32_e32 v9, v9, v20
 ; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v8, v20
 ; TONGA-NEXT:    v_subb_u32_e32 v9, vcc, v9, v20, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB12_3
+; TONGA-NEXT:    s_branch .LBB12_3
 ; TONGA-NEXT:  .LBB12_2:
+; TONGA-NEXT:    s_mov_b64 s[6:7], -1
+; TONGA-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; TONGA-NEXT:  .LBB12_3: ; %Flow7
+; TONGA-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TONGA-NEXT:    s_cselect_b32 s0, 1, 0
+; TONGA-NEXT:    s_cmp_lg_u32 s0, 1
+; TONGA-NEXT:    s_cbranch_scc1 .LBB12_5
+; TONGA-NEXT:  ; %bb.4:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, v10
 ; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v10
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
@@ -6239,12 +6350,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v10
 ; TONGA-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
 ; TONGA-NEXT:    v_mov_b32_e32 v9, 0
-; TONGA-NEXT:  .LBB12_3:
+; TONGA-NEXT:  .LBB12_5:
 ; TONGA-NEXT:    v_or_b32_e32 v11, v17, v13
 ; TONGA-NEXT:    v_mov_b32_e32 v10, 0
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; TONGA-NEXT:    s_cbranch_vccz .LBB12_14
-; TONGA-NEXT:  ; %bb.4:
+; TONGA-NEXT:    s_cbranch_vccz .LBB12_7
+; TONGA-NEXT:  ; %bb.6:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v13
 ; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v12, v10
 ; TONGA-NEXT:    v_addc_u32_e32 v13, vcc, v13, v10, vcc
@@ -6256,6 +6367,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subb_u32_e32 v22, vcc, 0, v20, vcc
 ; TONGA-NEXT:    v_madmk_f32 v10, v11, 0x4f800000, v10
 ; TONGA-NEXT:    v_rcp_f32_e32 v10, v10
+; TONGA-NEXT:    s_mov_b64 s[6:7], 0
 ; TONGA-NEXT:    v_mul_f32_e32 v10, 0x5f7ffffc, v10
 ; TONGA-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v10
 ; TONGA-NEXT:    v_trunc_f32_e32 v11, v11
@@ -6350,8 +6462,15 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_xor_b32_e32 v11, v11, v18
 ; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, v10, v18
 ; TONGA-NEXT:    v_subb_u32_e32 v11, vcc, v11, v18, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB12_6
-; TONGA-NEXT:  .LBB12_5:
+; TONGA-NEXT:    s_branch .LBB12_8
+; TONGA-NEXT:  .LBB12_7:
+; TONGA-NEXT:    s_mov_b64 s[6:7], -1
+; TONGA-NEXT:  .LBB12_8: ; %Flow6
+; TONGA-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TONGA-NEXT:    s_cselect_b32 s0, 1, 0
+; TONGA-NEXT:    s_cmp_lg_u32 s0, 1
+; TONGA-NEXT:    s_cbranch_scc1 .LBB12_10
+; TONGA-NEXT:  ; %bb.9:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v12
 ; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v12
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
@@ -6370,13 +6489,13 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v12
 ; TONGA-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
 ; TONGA-NEXT:    v_mov_b32_e32 v11, 0
-; TONGA-NEXT:  .LBB12_6:
+; TONGA-NEXT:  .LBB12_10:
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
 ; TONGA-NEXT:    v_or_b32_e32 v13, v5, v1
 ; TONGA-NEXT:    v_mov_b32_e32 v12, 0
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; TONGA-NEXT:    s_cbranch_vccz .LBB12_15
-; TONGA-NEXT:  ; %bb.7:
+; TONGA-NEXT:    s_cbranch_vccz .LBB12_12
+; TONGA-NEXT:  ; %bb.11:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v1
 ; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v0, v12
 ; TONGA-NEXT:    v_addc_u32_e32 v1, vcc, v1, v12, vcc
@@ -6388,6 +6507,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subb_u32_e32 v20, vcc, 0, v1, vcc
 ; TONGA-NEXT:    v_madmk_f32 v12, v13, 0x4f800000, v12
 ; TONGA-NEXT:    v_rcp_f32_e32 v12, v12
+; TONGA-NEXT:    s_mov_b64 s[6:7], 0
 ; TONGA-NEXT:    v_mul_f32_e32 v12, 0x5f7ffffc, v12
 ; TONGA-NEXT:    v_mul_f32_e32 v13, 0x2f800000, v12
 ; TONGA-NEXT:    v_trunc_f32_e32 v13, v13
@@ -6482,8 +6602,16 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v16
 ; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, v5, v16
 ; TONGA-NEXT:    v_subb_u32_e32 v13, vcc, v1, v16, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB12_9
-; TONGA-NEXT:  .LBB12_8:
+; TONGA-NEXT:    s_branch .LBB12_13
+; TONGA-NEXT:  .LBB12_12:
+; TONGA-NEXT:    s_mov_b64 s[6:7], -1
+; TONGA-NEXT:    ; implicit-def: $vgpr12_vgpr13
+; TONGA-NEXT:  .LBB12_13: ; %Flow5
+; TONGA-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TONGA-NEXT:    s_cselect_b32 s0, 1, 0
+; TONGA-NEXT:    s_cmp_lg_u32 s0, 1
+; TONGA-NEXT:    s_cbranch_scc1 .LBB12_15
+; TONGA-NEXT:  ; %bb.14:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
 ; TONGA-NEXT:    v_mov_b32_e32 v13, 0
@@ -6502,12 +6630,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v0, v1
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
 ; TONGA-NEXT:    v_cndmask_b32_e32 v12, v1, v4, vcc
-; TONGA-NEXT:  .LBB12_9:
+; TONGA-NEXT:  .LBB12_15:
 ; TONGA-NEXT:    v_or_b32_e32 v1, v7, v3
 ; TONGA-NEXT:    v_mov_b32_e32 v0, 0
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; TONGA-NEXT:    s_cbranch_vccz .LBB12_16
-; TONGA-NEXT:  ; %bb.10:
+; TONGA-NEXT:    s_cbranch_vccz .LBB12_17
+; TONGA-NEXT:  ; %bb.16:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v2, v0
 ; TONGA-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
@@ -6519,6 +6647,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subb_u32_e32 v18, vcc, 0, v16, vcc
 ; TONGA-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; TONGA-NEXT:    v_rcp_f32_e32 v0, v0
+; TONGA-NEXT:    s_mov_b64 s[6:7], 0
 ; TONGA-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; TONGA-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; TONGA-NEXT:    v_trunc_f32_e32 v1, v1
@@ -6613,8 +6742,15 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v15
 ; TONGA-NEXT:    v_sub_u32_e32 v14, vcc, v0, v15
 ; TONGA-NEXT:    v_subb_u32_e32 v15, vcc, v1, v15, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB12_12
-; TONGA-NEXT:  .LBB12_11:
+; TONGA-NEXT:    s_branch .LBB12_18
+; TONGA-NEXT:  .LBB12_17:
+; TONGA-NEXT:    s_mov_b64 s[6:7], -1
+; TONGA-NEXT:  .LBB12_18: ; %Flow
+; TONGA-NEXT:    s_and_b64 s[0:1], s[6:7], exec
+; TONGA-NEXT:    s_cselect_b32 s0, 1, 0
+; TONGA-NEXT:    s_cmp_lg_u32 s0, 1
+; TONGA-NEXT:    s_cbranch_scc1 .LBB12_20
+; TONGA-NEXT:  ; %bb.19:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, 0, v2
 ; TONGA-NEXT:    v_mov_b32_e32 v15, 0
@@ -6633,7 +6769,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v2, v0
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; TONGA-NEXT:    v_cndmask_b32_e32 v14, v0, v1, vcc
-; TONGA-NEXT:  .LBB12_12:
+; TONGA-NEXT:  .LBB12_20:
 ; TONGA-NEXT:    v_mov_b32_e32 v0, s4
 ; TONGA-NEXT:    v_mov_b32_e32 v1, s5
 ; TONGA-NEXT:    s_add_u32 s0, s4, 16
@@ -6643,16 +6779,6 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_mov_b32_e32 v1, s1
 ; TONGA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
 ; TONGA-NEXT:    s_endpgm
-; TONGA-NEXT:  .LBB12_13:
-; TONGA-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; TONGA-NEXT:    s_branch .LBB12_2
-; TONGA-NEXT:  .LBB12_14:
-; TONGA-NEXT:    s_branch .LBB12_5
-; TONGA-NEXT:  .LBB12_15:
-; TONGA-NEXT:    ; implicit-def: $vgpr12_vgpr13
-; TONGA-NEXT:    s_branch .LBB12_8
-; TONGA-NEXT:  .LBB12_16:
-; TONGA-NEXT:    s_branch .LBB12_11
 ;
 ; EG-LABEL: srem_v4i64:
 ; EG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 8498e9af46f2b5..9d2ae450e8360f 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -125,7 +125,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
 ; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
@@ -141,16 +140,21 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
+; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GCN-IR-NEXT:    s_cselect_b32 s11, 1, 0
+; GCN-IR-NEXT:    s_cmp_lg_u32 s11, 1
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
 ; GCN-IR-NEXT:    s_addc_u32 s15, s13, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
-; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GCN-IR-NEXT:    s_cselect_b32 s13, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s12
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_cmp_lg_u32 s13, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s14
 ; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
@@ -1051,7 +1055,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
 ; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xd
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[6:7], 31
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[0:1], 31
@@ -1079,17 +1082,22 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GCN-IR-NEXT:    s_and_b64 s[16:17], s[16:17], exec
+; GCN-IR-NEXT:    s_cselect_b32 s13, 1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
+; GCN-IR-NEXT:    s_cmp_lg_u32 s13, 1
+; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s16, s14, 1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s15, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s14
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_4
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-IR-NEXT:    s_cselect_b32 s15, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s14
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GCN-IR-NEXT:    s_cmp_lg_u32 s15, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB8_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s16
 ; GCN-IR-NEXT:    s_add_u32 s18, s8, -1
@@ -1190,7 +1198,6 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-LABEL: s_test_srem24_48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-IR-NEXT:    s_sext_i32_i16 s7, s7
@@ -1224,17 +1231,22 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s5
 ; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s4
 ; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GCN-IR-NEXT:    s_and_b64 s[16:17], s[16:17], exec
+; GCN-IR-NEXT:    s_cselect_b32 s13, 1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
+; GCN-IR-NEXT:    s_cmp_lg_u32 s13, 1
+; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB9_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s16, s14, 1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s15, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[4:5], s14
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-IR-NEXT:    s_cselect_b32 s15, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s14
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
+; GCN-IR-NEXT:    s_cmp_lg_u32 s15, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB9_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[4:5], s16
 ; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
@@ -1429,17 +1441,21 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_and_b64 s[8:9], s[10:11], exec
 ; GCN-IR-NEXT:    s_cselect_b32 s8, 0, 24
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 1, 0
+; GCN-IR-NEXT:    s_cmp_lg_u32 s9, 1
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s8, s2, 1
 ; GCN-IR-NEXT:    s_addc_u32 s9, s3, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[8:9], 0
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s2, 63, s2
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], 24, s2
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_4
+; GCN-IR-NEXT:    s_cmp_lg_u32 s9, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB10_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s8
 ; GCN-IR-NEXT:    s_add_u32 s14, s4, -1
diff --git a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
index e56226f4e48c85..186c7626295d9f 100644
--- a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
@@ -5,9 +5,12 @@
 ; GCN-LABEL: sub_zext_zext:
 ; GCN: ds_read_b32 [[VAL:v[0-9]+]],
 ; GCN: v_cmp_lt_f32{{.*}} vcc, 0, [[VAL]]
-; GCN: v_cndmask_{{.*}} [[ZEXTCC1:v[0-9]+]], 0, 1, vcc
-; GCN: v_cmp_gt_f32{{.*}} vcc, 0, [[VAL]]
-; GCN: v_subbrev{{.*}} {{v[0-9]+}}, vcc, 0, [[ZEXTCC1]], vcc
+; GCN: v_cmp_gt_f32{{.*}} s[0:1], 0, v0
+; GCN: s_and_b64 s[2:3], vcc, exec
+; GCN: s_cselect_b32 s2, 1, 0
+; GCN: s_cmp_lg_u64 s[0:1], 0
+; GCN: s_subb_u32 s0, s2, 0
+; GCN: v_cvt_f32_i32_e32 v0, s0
 ;
 ; Before the reversion that this test is attached to, the compiler commuted
 ; the operands to the sub and used different logic to select the addc/subc
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index e5bc4c5721b904..a51cec5a0680d3 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -23,8 +23,8 @@ define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(ptr addrspace(1) %o
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI: v_cmp_eq_u32_e32 vcc, 1, [[TMP]]{{$}}
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
-; SI: buffer_store_byte [[RESULT]]
+; SI-NEXT: s_and_b64 {{s\[[0-9]+:[0-9]+\]}}, [[NEG]], exec
+; SI: buffer_store_byte
 define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_0(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %load = load i1, ptr addrspace(1) %in
   %ext = zext i1 %load to i32
@@ -120,8 +120,8 @@ define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(ptr addrspace(1) %o
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI: v_cmp_eq_u32_e32 vcc, 1, [[TMP]]{{$}}
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
-; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
-; SI: buffer_store_byte [[RESULT]]
+; SI-NEXT: s_and_b64 {{s\[[0-9]+:[0-9]+\]}}, [[NEG]], exec
+; SI: buffer_store_byte
 define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
   %load = load i1, ptr addrspace(1) %in
   %ext = zext i1 %load to i32
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 837b46f0ce578d..862dcfc06c0a59 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -32,7 +32,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-LABEL: kernel:
 ; GLOBALNESS1:       ; %bb.0: ; %bb
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GLOBALNESS1-NEXT:    s_load_dwordx4 s[72:75], s[6:7], 0x0
+; GLOBALNESS1-NEXT:    s_load_dwordx4 s[56:59], s[6:7], 0x0
 ; GLOBALNESS1-NEXT:    s_nop 0
 ; GLOBALNESS1-NEXT:    s_load_dword s6, s[6:7], 0x14
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v41, v0
@@ -40,64 +40,51 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v42, off
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[72:73]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[38:39], 0x18
-; GLOBALNESS1-NEXT:    s_load_dword s7, s[38:39], 0x20
+; GLOBALNESS1-NEXT:    global_load_dword v2, v42, s[56:57]
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s15
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[4:5]
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[38:39], 0x18
+; GLOBALNESS1-NEXT:    s_load_dword s7, s[38:39], 0x20
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s58, 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[56:57], -1, 0
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
+; GLOBALNESS1-NEXT:    s_xor_b64 s[60:61], s[56:57], -1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x40994400
-; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s74, 0
-; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s6, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[40:41], 1, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[40:41], s[4:5], v[0:1]
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[54:55], s[4:5], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_xor_b64 s[62:63], s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s7, 0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[48:49], 1, v0
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v3
-; GLOBALNESS1-NEXT:    s_mov_b32 s68, s14
-; GLOBALNESS1-NEXT:    s_mov_b32 s69, s13
-; GLOBALNESS1-NEXT:    s_mov_b32 s70, s12
+; GLOBALNESS1-NEXT:    s_xor_b64 s[64:65], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_mov_b32 s50, s14
+; GLOBALNESS1-NEXT:    s_mov_b32 s51, s13
+; GLOBALNESS1-NEXT:    s_mov_b32 s52, s12
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[8:9]
 ; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr44_vgpr45
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[56:57], 1, v3
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v2
+; GLOBALNESS1-NEXT:    v_readfirstlane_b32 s4, v2
+; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s4, 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[66:67], -1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s4, 1
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[68:69], -1, 0
+; GLOBALNESS1-NEXT:    s_cmp_eq_u32 s4, 1
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[70:71], -1, 0
+; GLOBALNESS1-NEXT:    s_cmp_eq_u32 s4, 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[72:73], -1, 0
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[58:59]
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_29
+; GLOBALNESS1-NEXT:    s_and_b64 s[6:7], s[72:73], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s6, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s6, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc0 .LBB1_30
 ; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow15
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
@@ -107,10 +94,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_30
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_31
 ; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
-; GLOBALNESS1-NEXT:    ; Child Loop BB1_16 Depth 2
+; GLOBALNESS1-NEXT:    ; Child Loop BB1_17 Depth 2
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0x80
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0
 ; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
@@ -124,202 +111,231 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s52
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s51
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s50
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[44:45]
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[56:57], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_10
 ; GLOBALNESS1-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s75, 1
+; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s59, 1
 ; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_7
 ; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock12
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s75, 1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s59, 1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_8
-; GLOBALNESS1-NEXT:    s_branch .LBB1_9
+; GLOBALNESS1-NEXT:    s_branch .LBB1_8
 ; GLOBALNESS1-NEXT:  .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], -1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT:  .LBB1_8: ; %LeafBlock
+; GLOBALNESS1-NEXT:  .LBB1_8: ; %Flow26
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s75, 0
+; GLOBALNESS1-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s8, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s8, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_10
+; GLOBALNESS1-NEXT:  ; %bb.9: ; %LeafBlock
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s59, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow25
+; GLOBALNESS1-NEXT:  .LBB1_10: ; %Flow25
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
-; GLOBALNESS1-NEXT:  ; %bb.10: ; %baz.exit.i
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_25
+; GLOBALNESS1-NEXT:  ; %bb.11: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS1-NEXT:    flat_load_dword v0, v[2:3]
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[60:61], 0, v0
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[42:43], 0, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[72:73], s[60:61]
-; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
-; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[74:75], s[42:43]
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_27
+; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[52:53]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
-; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[66:67], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_14
+; GLOBALNESS1-NEXT:  ; %bb.13: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
-; GLOBALNESS1-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
+; GLOBALNESS1-NEXT:  .LBB1_14: ; %bb44.lr.ph.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
-; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[62:63], 0, v2
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
-; GLOBALNESS1-NEXT:    s_branch .LBB1_16
-; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow16
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e64 s[44:45], 0, v[0:1]
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[46:47], 0, v2
+; GLOBALNESS1-NEXT:    s_branch .LBB1_17
+; GLOBALNESS1-NEXT:  .LBB1_15: ; %Flow16
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[50:51]
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_25
-; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
+; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb63.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[64:65], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc0 .LBB1_26
+; GLOBALNESS1-NEXT:  .LBB1_17: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[46:47]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb46.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[48:49]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb50.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[40:41]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
-; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb3.i.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[42:43]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
-; GLOBALNESS1-NEXT:  ; %bb.20: ; %bb6.i.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[64:65]
-; GLOBALNESS1-NEXT:  .LBB1_21: ; %spam.exit.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[54:55]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb55.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_add_u32 s66, s38, 40
-; GLOBALNESS1-NEXT:    s_addc_u32 s67, s39, 0
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[60:61], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_16
+; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb46.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[62:63], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_16
+; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb50.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[40:41], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_22
+; GLOBALNESS1-NEXT:  ; %bb.20: ; %bb3.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[54:55], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_22
+; GLOBALNESS1-NEXT:  ; %bb.21: ; %bb6.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[44:45], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS1-NEXT:  .LBB1_22: ; %spam.exit.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[68:69], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_16
+; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb55.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_add_u32 s48, s38, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s49, s39, 0
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS1-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[76:77], s[4:5], 0x0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[36:37]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[66:67]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s52
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s51
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s50
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[36:37]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[66:67]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[48:49]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s52
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s51
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s50
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[62:63]
-; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_14
-; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb62.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[46:47]
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_15
+; GLOBALNESS1-NEXT:  ; %bb.24: ; %bb62.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
-; GLOBALNESS1-NEXT:    s_branch .LBB1_14
-; GLOBALNESS1-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_branch .LBB1_15
+; GLOBALNESS1-NEXT:  .LBB1_25: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_3
-; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow23
+; GLOBALNESS1-NEXT:  .LBB1_26: ; %Flow23
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT:  .LBB1_26: ; %Flow24
+; GLOBALNESS1-NEXT:  .LBB1_27: ; %Flow24
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[72:73]
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[60:61]
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[74:75]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[42:43]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
-; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb67.i
+; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb67.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[56:57]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
-; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
+; GLOBALNESS1-NEXT:    s_and_b64 s[6:7], s[70:71], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s6, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s6, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_1
+; GLOBALNESS1-NEXT:  ; %bb.29: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_1
-; GLOBALNESS1-NEXT:  .LBB1_29: ; %bb73.i
+; GLOBALNESS1-NEXT:  .LBB1_30: ; %bb73.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_2
-; GLOBALNESS1-NEXT:  .LBB1_30: ; %loop.exit.guard
-; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS1-NEXT:  .LBB1_31: ; %loop.exit.guard
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_32
-; GLOBALNESS1-NEXT:  ; %bb.31: ; %bb7.i.i
+; GLOBALNESS1-NEXT:    s_cbranch_scc0 .LBB1_33
+; GLOBALNESS1-NEXT:  ; %bb.32: ; %bb7.i.i
 ; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s52
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s51
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s50
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[6:7]
 ; GLOBALNESS1-NEXT:    s_add_u32 s6, s6, widget@rel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s7, s7, widget@rel32@hi+12
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS1-NEXT:  .LBB1_32: ; %Flow
-; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_34
-; GLOBALNESS1-NEXT:  ; %bb.33: ; %bb11.i.i
+; GLOBALNESS1-NEXT:  .LBB1_33: ; %Flow
+; GLOBALNESS1-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GLOBALNESS1-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_35
+; GLOBALNESS1-NEXT:  ; %bb.34: ; %bb11.i.i
 ; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[36:37]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s70
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s69
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s68
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s52
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s51
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s50
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[6:7]
 ; GLOBALNESS1-NEXT:    s_add_u32 s6, s6, widget@rel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s7, s7, widget@rel32@hi+12
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GLOBALNESS1-NEXT:  .LBB1_34: ; %UnifiedUnreachableBlock
+; GLOBALNESS1-NEXT:  .LBB1_35: ; %UnifiedUnreachableBlock
 ;
 ; GLOBALNESS0-LABEL: kernel:
 ; GLOBALNESS0:       ; %bb.0: ; %bb
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[6:7]
-; GLOBALNESS0-NEXT:    s_load_dwordx4 s[72:75], s[6:7], 0x0
+; GLOBALNESS0-NEXT:    s_load_dwordx4 s[52:55], s[6:7], 0x0
 ; GLOBALNESS0-NEXT:    s_nop 0
 ; GLOBALNESS0-NEXT:    s_load_dword s6, s[6:7], 0x14
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v41, v0
@@ -327,64 +343,51 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v42, off
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[72:73]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[4:5]
-; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[38:39], 0x18
-; GLOBALNESS0-NEXT:    s_load_dword s7, s[38:39], 0x20
+; GLOBALNESS0-NEXT:    global_load_dword v2, v42, s[52:53]
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s15
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[4:5]
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[38:39], 0x18
+; GLOBALNESS0-NEXT:    s_load_dword s7, s[38:39], 0x20
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s54, 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[58:59], -1, 0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
+; GLOBALNESS0-NEXT:    s_xor_b64 s[60:61], s[58:59], -1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x40994400
-; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s74, 0
-; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e32 vcc, s[4:5], v[0:1]
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s6, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[40:41], 1, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[40:41], s[4:5], v[0:1]
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[56:57], s[4:5], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_xor_b64 s[62:63], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s7, 0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[46:47], 1, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[48:49], 1, v0
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[50:51], 1, v0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[44:45], 1, v3
-; GLOBALNESS0-NEXT:    s_mov_b32 s66, s14
-; GLOBALNESS0-NEXT:    s_mov_b32 s67, s13
-; GLOBALNESS0-NEXT:    s_mov_b32 s68, s12
+; GLOBALNESS0-NEXT:    s_xor_b64 s[64:65], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_mov_b32 s48, s14
+; GLOBALNESS0-NEXT:    s_mov_b32 s49, s13
+; GLOBALNESS0-NEXT:    s_mov_b32 s50, s12
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[8:9]
 ; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr44_vgpr45
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v2
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[52:53], 1, v0
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[56:57], 1, v3
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v2
+; GLOBALNESS0-NEXT:    v_readfirstlane_b32 s4, v2
+; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s4, 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[66:67], -1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s4, 1
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[68:69], -1, 0
+; GLOBALNESS0-NEXT:    s_cmp_eq_u32 s4, 1
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[70:71], -1, 0
+; GLOBALNESS0-NEXT:    s_cmp_eq_u32 s4, 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[72:73], -1, 0
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[58:59]
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_29
+; GLOBALNESS0-NEXT:    s_and_b64 s[6:7], s[72:73], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s6, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s6, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc0 .LBB1_30
 ; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow15
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
@@ -394,10 +397,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_30
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_31
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
-; GLOBALNESS0-NEXT:    ; Child Loop BB1_16 Depth 2
+; GLOBALNESS0-NEXT:    ; Child Loop BB1_17 Depth 2
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0x80
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0
 ; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
@@ -411,197 +414,226 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s68
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s67
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s66
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s50
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s48
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[44:45]
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[58:59], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_10
 ; GLOBALNESS0-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s75, 1
+; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s55, 1
 ; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_7
 ; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock12
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_8
-; GLOBALNESS0-NEXT:    s_branch .LBB1_9
+; GLOBALNESS0-NEXT:    s_branch .LBB1_8
 ; GLOBALNESS0-NEXT:  .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], -1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT:  .LBB1_8: ; %LeafBlock
+; GLOBALNESS0-NEXT:  .LBB1_8: ; %Flow26
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 0
+; GLOBALNESS0-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s8, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s8, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_10
+; GLOBALNESS0-NEXT:  ; %bb.9: ; %LeafBlock
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s55, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow25
+; GLOBALNESS0-NEXT:  .LBB1_10: ; %Flow25
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
-; GLOBALNESS0-NEXT:  ; %bb.10: ; %baz.exit.i
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
+; GLOBALNESS0-NEXT:  ; %bb.11: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    flat_load_dword v0, v[2:3]
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[60:61], 0, v0
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[42:43], 0, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v0, 0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[72:73], s[60:61]
-; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
-; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[74:75], s[42:43]
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_27
+; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[52:53]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
-; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[66:67], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_14
+; GLOBALNESS0-NEXT:  ; %bb.13: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
-; GLOBALNESS0-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
+; GLOBALNESS0-NEXT:  .LBB1_14: ; %bb44.lr.ph.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
-; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[62:63], 0, v2
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[64:65], 1, v0
-; GLOBALNESS0-NEXT:    s_branch .LBB1_16
-; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow16
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e64 s[44:45], 0, v[0:1]
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[46:47], 0, v2
+; GLOBALNESS0-NEXT:    s_branch .LBB1_17
+; GLOBALNESS0-NEXT:  .LBB1_15: ; %Flow16
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[50:51]
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
-; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
+; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb63.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[64:65], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc0 .LBB1_26
+; GLOBALNESS0-NEXT:  .LBB1_17: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[46:47]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb46.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[48:49]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb50.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[40:41]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
-; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb3.i.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[42:43]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
-; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb6.i.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[64:65]
-; GLOBALNESS0-NEXT:  .LBB1_21: ; %spam.exit.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb55.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_add_u32 s70, s38, 40
-; GLOBALNESS0-NEXT:    s_addc_u32 s71, s39, 0
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[60:61], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_16
+; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb46.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[62:63], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_16
+; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb50.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[40:41], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_22
+; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb3.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[56:57], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_22
+; GLOBALNESS0-NEXT:  ; %bb.21: ; %bb6.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[44:45], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS0-NEXT:  .LBB1_22: ; %spam.exit.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[68:69], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_16
+; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb55.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_add_u32 s52, s38, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s53, s39, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[4:5]
 ; GLOBALNESS0-NEXT:    s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[76:77], s[4:5], 0x0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[70:71]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s68
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s67
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s66
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s50
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s48
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[36:37]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[70:71]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[52:53]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s68
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s67
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s66
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s50
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s48
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[76:77]
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[62:63]
-; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_14
-; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb62.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[46:47]
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_15
+; GLOBALNESS0-NEXT:  ; %bb.24: ; %bb62.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
-; GLOBALNESS0-NEXT:    s_branch .LBB1_14
-; GLOBALNESS0-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_branch .LBB1_15
+; GLOBALNESS0-NEXT:  .LBB1_25: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_3
-; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow23
+; GLOBALNESS0-NEXT:  .LBB1_26: ; %Flow23
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:  .LBB1_26: ; %Flow24
+; GLOBALNESS0-NEXT:  .LBB1_27: ; %Flow24
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[72:73]
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[60:61]
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[74:75]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[42:43]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
-; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb67.i
+; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb67.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[56:57]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
-; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
+; GLOBALNESS0-NEXT:    s_and_b64 s[6:7], s[70:71], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s6, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s6, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_1
+; GLOBALNESS0-NEXT:  ; %bb.29: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_1
-; GLOBALNESS0-NEXT:  .LBB1_29: ; %bb73.i
+; GLOBALNESS0-NEXT:  .LBB1_30: ; %bb73.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_2
-; GLOBALNESS0-NEXT:  .LBB1_30: ; %loop.exit.guard
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:  .LBB1_31: ; %loop.exit.guard
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_32
-; GLOBALNESS0-NEXT:  ; %bb.31: ; %bb7.i.i
+; GLOBALNESS0-NEXT:    s_cbranch_scc0 .LBB1_33
+; GLOBALNESS0-NEXT:  ; %bb.32: ; %bb7.i.i
 ; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s68
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s67
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s66
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s50
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s48
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[6:7]
 ; GLOBALNESS0-NEXT:    s_add_u32 s6, s6, widget@rel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s7, s7, widget@rel32@hi+12
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS0-NEXT:  .LBB1_32: ; %Flow
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_34
-; GLOBALNESS0-NEXT:  ; %bb.33: ; %bb11.i.i
+; GLOBALNESS0-NEXT:  .LBB1_33: ; %Flow
+; GLOBALNESS0-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GLOBALNESS0-NEXT:    s_cselect_b32 s4, 1, 0
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s4, 1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_35
+; GLOBALNESS0-NEXT:  ; %bb.34: ; %bb11.i.i
 ; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
 ; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[36:37]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s68
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s67
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s66
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s50
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s48
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v41
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[6:7]
 ; GLOBALNESS0-NEXT:    s_add_u32 s6, s6, widget@rel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s7, s7, widget@rel32@hi+12
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[6:7]
-; GLOBALNESS0-NEXT:  .LBB1_34: ; %UnifiedUnreachableBlock
+; GLOBALNESS0-NEXT:  .LBB1_35: ; %UnifiedUnreachableBlock
 bb:
   store i32 0, ptr addrspace(1) null, align 4
   %tmp4 = load i32, ptr addrspace(1) %arg1.global, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 1fd5f7f8f9bb3b..96770e84dd8548 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -88,9 +88,12 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s9, s7
 ; SI-NEXT:    v_mov_b32_e32 v0, s13
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, s12, v0
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_uaddo_i32:
@@ -100,13 +103,15 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v4
-; VI-NEXT:    flat_store_byte v[2:3], v5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_uaddo_i32:
@@ -117,9 +122,11 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
@@ -154,9 +161,12 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_uaddo_i32:
@@ -175,9 +185,11 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    flat_store_dword v[0:1], v4
-; VI-NEXT:    flat_store_byte v[2:3], v5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_uaddo_i32:
@@ -189,9 +201,11 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -232,12 +246,15 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
 ; SI-NEXT:    ;;#ASMSTART
 ; SI-NEXT:    ;;#ASMEND
-; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
@@ -256,13 +273,15 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v5
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    ;;#ASMSTART
 ; VI-NEXT:    ;;#ASMEND
-; VI-NEXT:    flat_store_byte v[2:3], v5
+; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
 ;
@@ -274,13 +293,15 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v2
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -304,22 +325,25 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_add_u32 s6, s4, s6
-; SI-NEXT:    s_addc_u32 s7, s5, s7
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
 ; SI-NEXT:    s_mov_b32 s12, s2
 ; SI-NEXT:    s_mov_b32 s13, s3
+; SI-NEXT:    s_add_u32 s0, s4, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_addc_u32 s1, s5, s7
+; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_byte v0, off, s[12:15], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -333,31 +357,35 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_addc_u32 s1, s5, s7
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_mov_b32_e32 v7, s1
 ; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_uaddo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_add_u32 s0, s8, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    s_addc_u32 s1, s9, s11
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_byte v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -393,9 +421,11 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; SI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
 ; SI-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -418,7 +448,9 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; VI-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[6:7], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -434,7 +466,9 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -561,11 +595,16 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
+; SI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_uaddo_v2i32:
@@ -584,11 +623,15 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
-; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, v2
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
-; VI-NEXT:    flat_store_dwordx2 v[6:7], v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_uaddo_v2i32:
@@ -600,11 +643,15 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
+; GFX9-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
   %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
@@ -633,9 +680,11 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
 ; SI-NEXT:    s_mov_b32 s10, s2
 ; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
@@ -658,14 +707,16 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:    s_xor_b64 s[0:1], vcc, -1
 ; VI-NEXT:  .LBB8_2: ; %exit
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s5
 ; VI-NEXT:    v_mov_b32_e32 v3, s6
 ; VI-NEXT:    v_mov_b32_e32 v4, s7
 ; VI-NEXT:    flat_store_dword v[1:2], v0
-; VI-NEXT:    flat_store_byte v[3:4], v5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_byte v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_uaddo_clamp_bit:
@@ -681,8 +732,10 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
 ; GFX9-NEXT:  .LBB8_2: ; %exit
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_store_byte v1, v2, s[6:7]
@@ -734,9 +787,11 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s2
 ; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT:    s_cselect_b32 s8, 1, 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -758,12 +813,14 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:  ; %bb.1: ; %if
 ; VI-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
 ; VI-NEXT:  .LBB9_2: ; %exit
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], exec
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s6
 ; VI-NEXT:    v_mov_b32_e32 v4, s7
 ; VI-NEXT:    flat_store_dword v[1:2], v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -782,8 +839,10 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:  ; %bb.1: ; %if
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
 ; GFX9-NEXT:  .LBB9_2: ; %exit
+; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index a90454f50d198c..10cdd0dc13c8a2 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -126,7 +126,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
 ; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
@@ -142,16 +141,21 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
+; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GCN-IR-NEXT:    s_cselect_b32 s11, 1, 0
+; GCN-IR-NEXT:    s_cmp_lg_u32 s11, 1
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
 ; GCN-IR-NEXT:    s_addc_u32 s15, s13, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
-; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GCN-IR-NEXT:    s_cselect_b32 s13, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s12
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_cmp_lg_u32 s13, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s14
 ; GCN-IR-NEXT:    s_add_u32 s14, s4, -1
@@ -832,7 +836,6 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-IR-LABEL: s_test_udiv24_i48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0xb
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_and_b32 s1, s5, 0xffff
 ; GCN-IR-NEXT:    s_and_b32 s0, s4, 0xff000000
@@ -856,17 +859,22 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_cselect_b32 s7, 0, s9
 ; GCN-IR-NEXT:    s_cselect_b32 s6, 0, s8
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
+; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GCN-IR-NEXT:    s_cselect_b32 s11, 1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_5
+; GCN-IR-NEXT:    s_cmp_lg_u32 s11, 1
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB7_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
 ; GCN-IR-NEXT:    s_addc_u32 s15, s13, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[14:15], 0
-; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[8:9], s12
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_4
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-IR-NEXT:    s_cselect_b32 s13, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s6, 63, s12
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[8:9], s6
+; GCN-IR-NEXT:    s_cmp_lg_u32 s13, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB7_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[8:9], s14
 ; GCN-IR-NEXT:    s_add_u32 s14, s0, -1
@@ -1038,17 +1046,21 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_and_b64 s[6:7], s[10:11], exec
 ; GCN-IR-NEXT:    s_cselect_b32 s6, 0, 24
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-IR-NEXT:    s_cselect_b32 s7, 1, 0
+; GCN-IR-NEXT:    s_cmp_lg_u32 s7, 1
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
 ; GCN-IR-NEXT:    s_addc_u32 s11, s9, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], 0
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s8
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_4
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s6, 63, s8
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s6
+; GCN-IR-NEXT:    s_cmp_lg_u32 s9, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB8_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s10
 ; GCN-IR-NEXT:    s_add_u32 s14, s2, -1
@@ -1406,17 +1418,21 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cselect_b32 s7, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s6, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[10:11]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-IR-NEXT:    s_cmp_lg_u32 s4, 1
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_5
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB11_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
 ; GCN-IR-NEXT:    s_addc_u32 s11, s9, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], 0
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s8
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_4
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s6, 63, s8
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GCN-IR-NEXT:    s_cmp_lg_u32 s9, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB11_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[8:9], s[2:3], s10
 ; GCN-IR-NEXT:    s_add_u32 s2, s12, 0xffffffc4
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 0acee5bd5ac19d..bdc4b24462eebd 100644
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -52,14 +52,15 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], 0
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    s_waitcnt expcnt(1)
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v1
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:  .LBB1_1: ; %bb9
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB1_1
+; CHECK-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; CHECK-NEXT:    s_cselect_b32 s2, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s2, 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB1_1
 ; CHECK-NEXT:  ; %bb.2: ; %bb11
 ; CHECK-NEXT:    s_mov_b32 s3, 0xf000
 ; CHECK-NEXT:    s_mov_b32 s2, -1
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index a3fc6ded0a0047..c668a87241ca76 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -505,9 +505,9 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p
 ; SI-NEXT:  .LBB9_2: ; %bb9
 ; SI-NEXT:    s_endpgm
 ; SI-NEXT:  .LBB9_3: ; %bb7
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0xb
-; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -528,9 +528,9 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p
 ; VI-NEXT:  .LBB9_2: ; %bb9
 ; VI-NEXT:    s_endpgm
 ; VI-NEXT:  .LBB9_3: ; %bb7
-; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x2c
-; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    v_subrev_u32_e32 v0, vcc, s4, v0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index a5e1506114f2d0..93287d9c6743c7 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -36,17 +36,32 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_mov_b64 s[20:21], -1
 ; GCN-NEXT:    s_mov_b64 s[16:17], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_readfirstlane_b32 s22, v0
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 21, v0
 ; GCN-NEXT:    s_mov_b64 s[46:47], 0
 ; GCN-NEXT:    s_mov_b64 s[18:19], 0
-; GCN-NEXT:    s_cbranch_vccz .LBB0_9
-; GCN-NEXT:  ; %bb.1: ; %Flow
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
-; GCN-NEXT:    s_cbranch_vccz .LBB0_10
-; GCN-NEXT:  .LBB0_2: ; %Flow1
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-NEXT:    s_cbranch_vccnz .LBB0_4
-; GCN-NEXT:  .LBB0_3: ; %bb9
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_2
+; GCN-NEXT:  ; %bb.1: ; %bb2
+; GCN-NEXT:    s_cmp_eq_u32 s22, 21
+; GCN-NEXT:    s_cselect_b64 s[46:47], -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s22, 21
+; GCN-NEXT:    s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT:    s_mov_b64 s[20:21], 0
+; GCN-NEXT:  .LBB0_2: ; %Flow
+; GCN-NEXT:    s_and_b64 s[20:21], s[20:21], exec
+; GCN-NEXT:    s_cselect_b32 s20, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s20, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_4
+; GCN-NEXT:  ; %bb.3: ; %bb4
+; GCN-NEXT:    s_mov_b64 s[16:17], -1
+; GCN-NEXT:    s_cmp_lg_u32 s22, 9
+; GCN-NEXT:    s_cselect_b64 s[18:19], -1, 0
+; GCN-NEXT:  .LBB0_4: ; %Flow1
+; GCN-NEXT:    s_and_b64 s[18:19], s[18:19], exec
+; GCN-NEXT:    s_cselect_b32 s18, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s18, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_6
+; GCN-NEXT:  ; %bb.5: ; %bb9
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, wibble@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, wibble@rel32@hi+12
@@ -74,25 +89,27 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_andn2_b64 s[18:19], s[46:47], exec
 ; GCN-NEXT:    s_and_b64 s[20:21], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[46:47], s[18:19], s[20:21]
-; GCN-NEXT:  .LBB0_4: ; %Flow2
+; GCN-NEXT:  .LBB0_6: ; %Flow2
 ; GCN-NEXT:    s_and_saveexec_b64 s[18:19], s[46:47]
 ; GCN-NEXT:    s_xor_b64 s[18:19], exec, s[18:19]
-; GCN-NEXT:    s_cbranch_execz .LBB0_6
-; GCN-NEXT:  ; %bb.5: ; %bb12
+; GCN-NEXT:    s_cbranch_execz .LBB0_8
+; GCN-NEXT:  ; %bb.7: ; %bb12
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
-; GCN-NEXT:  .LBB0_6: ; %Flow3
+; GCN-NEXT:  .LBB0_8: ; %Flow3
 ; GCN-NEXT:    s_or_b64 exec, exec, s[18:19]
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-NEXT:    s_cbranch_vccnz .LBB0_8
-; GCN-NEXT:  ; %bb.7: ; %bb7
+; GCN-NEXT:    s_and_b64 s[16:17], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s16, 1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s16, 1
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_10
+; GCN-NEXT:  ; %bb.9: ; %bb7
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, wibble@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, wibble@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT:  .LBB0_8: ; %UnifiedReturnBlock
+; GCN-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
 ; GCN-NEXT:    v_readlane_b32 s47, v41, 15
 ; GCN-NEXT:    v_readlane_b32 s46, v41, 14
 ; GCN-NEXT:    v_readlane_b32 s45, v41, 13
@@ -118,17 +135,6 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB0_9: ; %bb2
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[46:47], 21, v0
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[18:19], 21, v0
-; GCN-NEXT:    s_mov_b64 vcc, exec
-; GCN-NEXT:    s_cbranch_execnz .LBB0_2
-; GCN-NEXT:  .LBB0_10: ; %bb4
-; GCN-NEXT:    s_mov_b64 s[16:17], -1
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[18:19], 9, v0
-; GCN-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-NEXT:    s_cbranch_vccz .LBB0_3
-; GCN-NEXT:    s_branch .LBB0_4
 ; SI-OPT-LABEL: @widget(
 ; SI-OPT-NEXT:  bb:
 ; SI-OPT-NEXT:    [[TMP:%.*]] = load i32, ptr addrspace(1) null, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 3f346db3f3e665..02ca2189b12faa 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -125,7 +125,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
 ; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
@@ -141,16 +140,21 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
+; GCN-IR-NEXT:    s_and_b64 s[14:15], s[14:15], exec
+; GCN-IR-NEXT:    s_cselect_b32 s11, 1, 0
+; GCN-IR-NEXT:    s_cmp_lg_u32 s11, 1
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
 ; GCN-IR-NEXT:    s_addc_u32 s15, s13, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
-; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GCN-IR-NEXT:    s_cselect_b32 s13, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s12
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_cmp_lg_u32 s13, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s14
 ; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
@@ -898,17 +902,21 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_and_b64 s[6:7], s[10:11], exec
 ; GCN-IR-NEXT:    s_cselect_b32 s6, 0, 24
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-IR-NEXT:    s_cselect_b32 s7, 1, 0
+; GCN-IR-NEXT:    s_cmp_lg_u32 s7, 1
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_5
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB6_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
 ; GCN-IR-NEXT:    s_addc_u32 s11, s9, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], 0
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s8
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_4
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s6, 63, s8
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s6
+; GCN-IR-NEXT:    s_cmp_lg_u32 s9, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB6_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s10
 ; GCN-IR-NEXT:    s_add_u32 s14, s2, -1
@@ -1015,17 +1023,21 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cselect_b32 s7, 0, s3
 ; GCN-IR-NEXT:    s_cselect_b32 s6, 0, s2
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[10:11]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-IR-NEXT:    s_cmp_lg_u32 s4, 1
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_5
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB7_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
 ; GCN-IR-NEXT:    s_addc_u32 s11, s9, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], 0
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s8
-; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_4
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 1, 0
+; GCN-IR-NEXT:    s_sub_i32 s6, 63, s8
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GCN-IR-NEXT:    s_cmp_lg_u32 s9, 1
+; GCN-IR-NEXT:    s_cbranch_scc0 .LBB7_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[10:11], s[2:3], s10
 ; GCN-IR-NEXT:    s_add_u32 s8, s12, 0xffffffc4
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index cf13bb2efcae7c..690cd1931eb37e 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -88,9 +88,12 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s9, s7
 ; SI-NEXT:    v_mov_b32_e32 v0, s13
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, s12, v0
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    buffer_store_byte v1, off, s[8:11], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_usubo_i32:
@@ -100,13 +103,15 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_sub_u32_e32 v4, vcc, s0, v4
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v4
-; VI-NEXT:    flat_store_byte v[2:3], v5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_usubo_i32:
@@ -117,9 +122,11 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
@@ -154,9 +161,12 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_usubo_i32:
@@ -175,9 +185,11 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    flat_store_dword v[0:1], v4
-; VI-NEXT:    flat_store_byte v[2:3], v5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_usubo_i32:
@@ -189,9 +201,11 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -232,12 +246,15 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
 ; SI-NEXT:    ;;#ASMSTART
 ; SI-NEXT:    ;;#ASMEND
-; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_endpgm
 ;
@@ -256,13 +273,15 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_sub_u32_e64 v4, s[0:1], v4, v5
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    ;;#ASMSTART
 ; VI-NEXT:    ;;#ASMEND
-; VI-NEXT:    flat_store_byte v[2:3], v5
+; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
 ;
@@ -274,13 +293,15 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_sub_co_u32_e64 v1, s[0:1], v1, v2
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ;;#ASMEND
-; GFX9-NEXT:    global_store_byte v0, v2, s[6:7]
+; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -304,22 +325,25 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_sub_u32 s6, s4, s6
-; SI-NEXT:    s_subb_u32 s7, s5, s7
 ; SI-NEXT:    s_mov_b32 s14, s10
 ; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
 ; SI-NEXT:    s_mov_b32 s12, s2
 ; SI-NEXT:    s_mov_b32 s13, s3
+; SI-NEXT:    s_sub_u32 s0, s4, s6
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
-; SI-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_subb_u32 s1, s5, s7
+; SI-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_byte v0, off, s[12:15], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -333,31 +357,35 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_subb_u32 s1, s5, s7
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    v_mov_b32_e32 v7, s1
 ; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[2:3], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_usubo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_sub_u32 s0, s8, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    s_subb_u32 s1, s9, s11
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    global_store_byte v2, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
@@ -393,9 +421,11 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v2, vcc, v0, v2
 ; SI-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
 ; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -418,7 +448,9 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
 ; VI-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[6:7], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -434,7 +466,9 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    global_store_byte v4, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -561,11 +595,16 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT:    v_sub_i32_e64 v0, s[0:1], v0, v2
+; SI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s2
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_usubo_v2i32:
@@ -584,11 +623,15 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v7, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
-; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_sub_u32_e64 v0, s[0:1], v0, v2
+; VI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
-; VI-NEXT:    flat_store_dwordx2 v[6:7], v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_usubo_v2i32:
@@ -600,11 +643,15 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[0:1], v0, v2
+; GFX9-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
   %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
   %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
@@ -633,9 +680,11 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
 ; SI-NEXT:    s_mov_b32 s10, s2
 ; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s0, s4
 ; SI-NEXT:    s_mov_b32 s1, s5
@@ -658,14 +707,16 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:    s_xor_b64 s[0:1], vcc, -1
 ; VI-NEXT:  .LBB8_2: ; %exit
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s5
 ; VI-NEXT:    v_mov_b32_e32 v3, s6
 ; VI-NEXT:    v_mov_b32_e32 v4, s7
 ; VI-NEXT:    flat_store_dword v[1:2], v0
-; VI-NEXT:    flat_store_byte v[3:4], v5
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_byte v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_usubo_clamp_bit:
@@ -681,8 +732,10 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
 ; GFX9-NEXT:  .LBB8_2: ; %exit
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_store_byte v1, v2, s[6:7]
@@ -735,9 +788,11 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s2
 ; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; SI-NEXT:    s_cselect_b32 s8, 1, 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -759,12 +814,14 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:  ; %bb.1: ; %if
 ; VI-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
 ; VI-NEXT:  .LBB9_2: ; %exit
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], exec
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s6
 ; VI-NEXT:    v_mov_b32_e32 v4, s7
 ; VI-NEXT:    flat_store_dword v[1:2], v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -783,8 +840,10 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:  ; %bb.1: ; %if
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
 ; GFX9-NEXT:  .LBB9_2: ; %exit
+; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
index 39c7538738eb15..b9ef7ac9ed8e8a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll
@@ -5,12 +5,12 @@ define amdgpu_kernel void @icmp_test() {
 ; CHECK-LABEL: icmp_test:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    v_cmp_eq_u16_e64 s[0:1], 0, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT:    ds_store_b32 v1, v0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s0
+; CHECK-NEXT:    ds_store_b32 v0, v1
 ; CHECK-NEXT:    s_endpgm
 entry:
   %icmp.intr = tail call i64 @llvm.amdgcn.icmp.i64.i16(i16 0, i16 0, i32 32)
@@ -24,16 +24,16 @@ define amdgpu_kernel void @fcmp_test(half %x, half %y) {
 ; CHECK-LABEL: fcmp_test:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_b32 s0, s[2:3], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_lshr_b32 s1, s0, 16
 ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; CHECK-NEXT:    v_cmp_le_f16_e64 s[0:1], s0, s1
 ; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
 ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT:    ds_store_b32 v1, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s0
+; CHECK-NEXT:    ds_store_b32 v0, v1
 ; CHECK-NEXT:    s_endpgm
 entry:
   %fcmp.intr = tail call i64 @llvm.amdgcn.fcmp.i64.f16(half %x, half %y, i32 5)
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index aec86ec343bdb5..18b8eb55c2ce2b 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -15,14 +15,16 @@ target datalayout = "A5"
 ; GCN-ALLOCA:         buffer_load_dword
 
 ; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
-; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
-; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
-; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
+; GCN-PROMOTE: s_cselect_b32 s2, 1, 0
+; GCN-PROMOTE: s_cmp_lg_u32 s4, 2
+; GCN-PROMOTE: s_cselect_b32 s2, s2, 2
+; GCN-PROMOTE: s_cmp_lg_u32 s4, 3
+; GCN-PROMOTE: s_cselect_b32 s2, s2, 3
+; GCN-PROMOTE: v_mov_b32_e32 v0, s0
+; GCN-PROMOTE: v_mov_b32_e32 v1, s1
+; GCN-PROMOTE: v_mov_b32_e32 v2, s2
+; GCN-PROMOTE: flat_store_dword v[0:1], v2
+; GCN-PROMOTE: s_endpgm
 ; GCN-PROMOTE: ScratchSize: 0
 
 define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) {
@@ -51,8 +53,10 @@ entry:
 ; GCN-ALLOCA-COUNT-5: buffer_store_dword
 ; GCN-ALLOCA:         buffer_load_dword
 
-; GCN-PROMOTE-COUNT-7: v_cndmask
-
+; GCN-PROMOTE: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-PROMOTE: s_waitcnt lgkmcnt(0)
+; GCN-PROMOTE: s_cmp_eq_u32 s2, 1
+; GCN-PROMOTE: s_cselect_b32 s4, 1, 0
 ; GCN-PROMOTE: ScratchSize: 0
 
 define amdgpu_kernel void @vector_write_alloca_bitcast(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
@@ -292,15 +296,17 @@ entry:
 ; GCN-ALLOCA:         buffer_load_dword
 
 ; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
-; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
-; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
-; GCN-PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
-; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
-; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
-
+; GCN-PROMOTE: s_cselect_b32 s2, 1, 0
+; GCN-PROMOTE: s_cmp_lg_u32 s4, 2
+; GCN-PROMOTE: s_cselect_b32 s2, s2, 2
+; GCN-PROMOTE: s_cmp_lg_u32 s4, 3
+; GCN-PROMOTE: s_cselect_b32 s2, s2, 3
+; GCN-PROMOTE: s_add_i32 s2, s2, 1
+; GCN-PROMOTE: v_mov_b32_e32 v0, s0
+; GCN-PROMOTE: v_mov_b32_e32 v1, s1
+; GCN-PROMOTE: v_mov_b32_e32 v2, s2
+; GCN-PROMOTE: flat_store_dword v[0:1], v2
+; GCN-PROMOTE: s_endpgm
 ; GCN-PROMOTE: ScratchSize: 0
 
 define amdgpu_kernel void @vector_read_alloca_multiuse(ptr addrspace(1) %out, i32 %index) {
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 4576d829b0cb0a..12a1221ebcf837 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -143,7 +143,9 @@ define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 {
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_cmp_class_f32_e64 s2, s4, 0x204
-; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX1032-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX1032-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1032-NEXT:    s_endpgm
 ;
@@ -155,7 +157,9 @@ define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 {
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_cmp_class_f32_e64 s[2:3], s4, 0x204
-; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX1064-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX1064-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1064-NEXT:    s_endpgm
   %fabs = tail call float @llvm.fabs.f32(float %x)
@@ -737,7 +741,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_or_b64 s[2:3], s[6:7], s[4:5]
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1032-NEXT:    s_cbranch_scc0 .LBB15_4
+; GFX1032-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v1, s5
@@ -856,9 +860,16 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v0, s1, v2, vcc_lo
-; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s2
-; GFX1032-NEXT:    s_cbranch_vccnz .LBB15_3
+; GFX1032-NEXT:    s_branch .LBB15_3
 ; GFX1032-NEXT:  .LBB15_2:
+; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT:  .LBB15_3: ; %Flow
+; GFX1032-NEXT:    s_and_b32 s0, s2, exec_lo
+; GFX1032-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX1032-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB15_5
+; GFX1032-NEXT:  ; %bb.4:
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1032-NEXT:    s_sub_i32 s1, 0, s4
 ; GFX1032-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -882,13 +893,10 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1032-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032-NEXT:  .LBB15_3:
+; GFX1032-NEXT:  .LBB15_5:
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9] offset:16
 ; GFX1032-NEXT:    s_endpgm
-; GFX1032-NEXT:  .LBB15_4:
-; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT:    s_branch .LBB15_2
 ;
 ; GFX1064-LABEL: test_udiv64:
 ; GFX1064:       ; %bb.0: ; %bb
@@ -899,7 +907,7 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    s_or_b64 s[0:1], s[6:7], s[4:5]
 ; GFX1064-NEXT:    s_mov_b32 s0, 0
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT:    s_cbranch_scc0 .LBB15_4
+; GFX1064-NEXT:    s_cbranch_scc0 .LBB15_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v1, s5
@@ -1013,13 +1021,22 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s14
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v0, s7, v0, s[0:1]
+; GFX1064-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v2, s2, v2, vcc
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
 ; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s11, v1, vcc
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v0, s10, v2, vcc
-; GFX1064-NEXT:    s_cbranch_execnz .LBB15_3
+; GFX1064-NEXT:    s_branch .LBB15_3
 ; GFX1064-NEXT:  .LBB15_2:
+; GFX1064-NEXT:    s_mov_b64 s[0:1], -1
+; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT:  .LBB15_3: ; %Flow
+; GFX1064-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX1064-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX1064-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB15_5
+; GFX1064-NEXT:  ; %bb.4:
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX1064-NEXT:    s_sub_i32 s1, 0, s4
 ; GFX1064-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -1043,13 +1060,10 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 ; GFX1064-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1064-NEXT:  .LBB15_3:
+; GFX1064-NEXT:  .LBB15_5:
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9] offset:16
 ; GFX1064-NEXT:    s_endpgm
-; GFX1064-NEXT:  .LBB15_4:
-; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX1064-NEXT:    s_branch .LBB15_2
 bb:
   %tmp = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 1
   %tmp1 = load i64, ptr addrspace(1) %tmp, align 8
@@ -1810,13 +1824,19 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
 ; GFX1032-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1032-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1032-NEXT:    s_branch .LBB33_2
+; GFX1032-NEXT:    s_branch .LBB33_3
 ; GFX1032-NEXT:  .LBB33_1: ; %body
-; GFX1032-NEXT:    ; in Loop: Header=BB33_2 Depth=1
+; GFX1032-NEXT:    ; in Loop: Header=BB33_3 Depth=1
 ; GFX1032-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX1032-NEXT:    v_add_f32_e32 v8, 2.0, v8
-; GFX1032-NEXT:    s_cbranch_execz .LBB33_4
-; GFX1032-NEXT:  .LBB33_2: ; %loop
+; GFX1032-NEXT:    s_mov_b32 s1, 0
+; GFX1032-NEXT:  .LBB33_2: ; %Flow
+; GFX1032-NEXT:    ; in Loop: Header=BB33_3 Depth=1
+; GFX1032-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX1032-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX1032-NEXT:    s_cmp_lg_u32 s1, 1
+; GFX1032-NEXT:    s_cbranch_scc0 .LBB33_5
+; GFX1032-NEXT:  .LBB33_3: ; %loop
 ; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1032-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -1825,10 +1845,12 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
 ; GFX1032-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX1032-NEXT:    s_cbranch_vccz .LBB33_1
-; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:  ; %bb.4: ; in Loop: Header=BB33_3 Depth=1
+; GFX1032-NEXT:    s_mov_b32 s1, -1
 ; GFX1032-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX1032-NEXT:    ; implicit-def: $vgpr8
-; GFX1032-NEXT:  .LBB33_4: ; %break
+; GFX1032-NEXT:    s_branch .LBB33_2
+; GFX1032-NEXT:  .LBB33_5: ; %break
 ; GFX1032-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    ; return to shader part epilog
@@ -1842,13 +1864,19 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
 ; GFX1064-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX1064-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v8, 0
-; GFX1064-NEXT:    s_branch .LBB33_2
+; GFX1064-NEXT:    s_branch .LBB33_3
 ; GFX1064-NEXT:  .LBB33_1: ; %body
-; GFX1064-NEXT:    ; in Loop: Header=BB33_2 Depth=1
+; GFX1064-NEXT:    ; in Loop: Header=BB33_3 Depth=1
 ; GFX1064-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX1064-NEXT:    v_add_f32_e32 v8, 2.0, v8
-; GFX1064-NEXT:    s_cbranch_execz .LBB33_4
-; GFX1064-NEXT:  .LBB33_2: ; %loop
+; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1064-NEXT:  .LBB33_2: ; %Flow
+; GFX1064-NEXT:    ; in Loop: Header=BB33_3 Depth=1
+; GFX1064-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX1064-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX1064-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX1064-NEXT:    s_cbranch_scc0 .LBB33_5
+; GFX1064-NEXT:  .LBB33_3: ; %loop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    v_cmp_lt_f32_e32 vcc, 0x40e00000, v8
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -1857,10 +1885,12 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
 ; GFX1064-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX1064-NEXT:    s_cbranch_vccz .LBB33_1
-; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:  ; %bb.4: ; in Loop: Header=BB33_3 Depth=1
+; GFX1064-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX1064-NEXT:    ; implicit-def: $vgpr8
-; GFX1064-NEXT:  .LBB33_4: ; %break
+; GFX1064-NEXT:    s_branch .LBB33_2
+; GFX1064-NEXT:  .LBB33_5: ; %break
 ; GFX1064-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index ab84c0c905771b..2f915657499de6 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1935,13 +1935,19 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-W64-NEXT:    s_mov_b32 s4, 0x40e00000
-; GFX9-W64-NEXT:    s_branch .LBB35_2
+; GFX9-W64-NEXT:    s_branch .LBB35_3
 ; GFX9-W64-NEXT:  .LBB35_1: ; %body
-; GFX9-W64-NEXT:    ; in Loop: Header=BB35_2 Depth=1
+; GFX9-W64-NEXT:    ; in Loop: Header=BB35_3 Depth=1
 ; GFX9-W64-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    v_add_f32_e32 v8, 2.0, v8
-; GFX9-W64-NEXT:    s_cbranch_execz .LBB35_4
-; GFX9-W64-NEXT:  .LBB35_2: ; %loop
+; GFX9-W64-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-W64-NEXT:  .LBB35_2: ; %Flow
+; GFX9-W64-NEXT:    ; in Loop: Header=BB35_3 Depth=1
+; GFX9-W64-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-W64-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-W64-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB35_5
+; GFX9-W64-NEXT:  .LBB35_3: ; %loop
 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v4
@@ -1950,10 +1956,12 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9-W64-NEXT:    s_cbranch_vccz .LBB35_1
-; GFX9-W64-NEXT:  ; %bb.3:
+; GFX9-W64-NEXT:  ; %bb.4: ; in Loop: Header=BB35_3 Depth=1
+; GFX9-W64-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr8
-; GFX9-W64-NEXT:  .LBB35_4: ; %break
+; GFX9-W64-NEXT:    s_branch .LBB35_2
+; GFX9-W64-NEXT:  .LBB35_5: ; %break
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    ; return to shader part epilog
@@ -1966,14 +1974,20 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    s_branch .LBB35_2
+; GFX10-W32-NEXT:    s_branch .LBB35_3
 ; GFX10-W32-NEXT:    .p2align 6
 ; GFX10-W32-NEXT:  .LBB35_1: ; %body
-; GFX10-W32-NEXT:    ; in Loop: Header=BB35_2 Depth=1
+; GFX10-W32-NEXT:    ; in Loop: Header=BB35_3 Depth=1
 ; GFX10-W32-NEXT:    image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
-; GFX10-W32-NEXT:    s_cbranch_execz .LBB35_4
-; GFX10-W32-NEXT:  .LBB35_2: ; %loop
+; GFX10-W32-NEXT:    s_mov_b32 s1, 0
+; GFX10-W32-NEXT:  .LBB35_2: ; %Flow
+; GFX10-W32-NEXT:    ; in Loop: Header=BB35_3 Depth=1
+; GFX10-W32-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX10-W32-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-W32-NEXT:    s_cmp_lg_u32 s1, 1
+; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB35_5
+; GFX10-W32-NEXT:  .LBB35_3: ; %loop
 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-W32-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
@@ -1982,10 +1996,12 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-W32-NEXT:    s_cbranch_vccz .LBB35_1
-; GFX10-W32-NEXT:  ; %bb.3:
+; GFX10-W32-NEXT:  ; %bb.4: ; in Loop: Header=BB35_3 Depth=1
+; GFX10-W32-NEXT:    s_mov_b32 s1, -1
 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr8
-; GFX10-W32-NEXT:  .LBB35_4: ; %break
+; GFX10-W32-NEXT:    s_branch .LBB35_2
+; GFX10-W32-NEXT:  .LBB35_5: ; %break
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v4
@@ -2182,15 +2198,21 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-W64-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
-; GFX9-W64-NEXT:    s_cbranch_execz .LBB39_3
-; GFX9-W64-NEXT:    s_branch .LBB39_4
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
+; GFX9-W64-NEXT:    s_branch .LBB39_3
 ; GFX9-W64-NEXT:  .LBB39_2:
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX9-W64-NEXT:  .LBB39_3: ; %if
+; GFX9-W64-NEXT:  .LBB39_3: ; %Flow
+; GFX9-W64-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-W64-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-W64-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX9-W64-NEXT:    s_cbranch_scc1 .LBB39_5
+; GFX9-W64-NEXT:  ; %bb.4: ; %if
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
-; GFX9-W64-NEXT:  .LBB39_4: ; %end
+; GFX9-W64-NEXT:  .LBB39_5: ; %end
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v5, 1.0
 ; GFX9-W64-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
@@ -2208,15 +2230,21 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX10-W32-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
-; GFX10-W32-NEXT:    s_cbranch_execz .LBB39_3
-; GFX10-W32-NEXT:    s_branch .LBB39_4
+; GFX10-W32-NEXT:    s_mov_b32 s0, 0
+; GFX10-W32-NEXT:    s_branch .LBB39_3
 ; GFX10-W32-NEXT:  .LBB39_2:
+; GFX10-W32-NEXT:    s_mov_b32 s0, -1
 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX10-W32-NEXT:  .LBB39_3: ; %if
+; GFX10-W32-NEXT:  .LBB39_3: ; %Flow
+; GFX10-W32-NEXT:    s_and_b32 s0, s0, exec_lo
+; GFX10-W32-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-W32-NEXT:    s_cmp_lg_u32 s0, 1
+; GFX10-W32-NEXT:    s_cbranch_scc1 .LBB39_5
+; GFX10-W32-NEXT:  ; %bb.4: ; %if
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT:  .LBB39_4: ; %end
+; GFX10-W32-NEXT:  .LBB39_5: ; %end
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v5, 1.0
 ; GFX10-W32-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 9fac17f33d0d36..290476dcb7d355 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -519,29 +519,33 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
 ; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
-; SI-NEXT:    s_cbranch_vccz .LBB12_4
+; SI-NEXT:    s_cbranch_vccz .LBB12_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_mov_b32 s15, 0xf000
 ; SI-NEXT:    s_mov_b32 s14, -1
 ; SI-NEXT:    s_mov_b32 s12, s2
 ; SI-NEXT:    s_mov_b32 s13, s3
 ; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT:    s_cbranch_vccnz .LBB12_3
-; SI-NEXT:  .LBB12_2: ; %if
+; SI-NEXT:    s_branch .LBB12_3
+; SI-NEXT:  .LBB12_2:
+; SI-NEXT:    s_mov_b64 s[8:9], -1
+; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SI-NEXT:  .LBB12_3: ; %Flow
+; SI-NEXT:    s_and_b64 s[2:3], s[8:9], exec
+; SI-NEXT:    s_cselect_b32 s2, 1, 0
+; SI-NEXT:    s_cmp_lg_u32 s2, 1
+; SI-NEXT:    s_cbranch_scc1 .LBB12_5
+; SI-NEXT:  ; %bb.4: ; %if
 ; SI-NEXT:    s_xor_b64 s[2:3], s[4:5], s[6:7]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:  .LBB12_3: ; %endif
+; SI-NEXT:  .LBB12_5: ; %endif
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
-; SI-NEXT:  .LBB12_4:
-; SI-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; SI-NEXT:    s_branch .LBB12_2
 ;
 ; VI-LABEL: xor_cf:
 ; VI:       ; %bb.0: ; %entry
@@ -549,27 +553,31 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
 ; VI-NEXT:    s_mov_b64 s[8:9], 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; VI-NEXT:    s_cbranch_scc0 .LBB12_4
+; VI-NEXT:    s_cbranch_scc0 .LBB12_2
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; VI-NEXT:    s_cbranch_vccnz .LBB12_3
-; VI-NEXT:  .LBB12_2: ; %if
+; VI-NEXT:    s_branch .LBB12_3
+; VI-NEXT:  .LBB12_2:
+; VI-NEXT:    s_mov_b64 s[8:9], -1
+; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT:  .LBB12_3: ; %Flow
+; VI-NEXT:    s_and_b64 s[2:3], s[8:9], exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s2, 1
+; VI-NEXT:    s_cbranch_scc1 .LBB12_5
+; VI-NEXT:  ; %bb.4: ; %if
 ; VI-NEXT:    s_xor_b64 s[2:3], s[4:5], s[6:7]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:  .LBB12_3: ; %endif
+; VI-NEXT:  .LBB12_5: ; %endif
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
-; VI-NEXT:  .LBB12_4:
-; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; VI-NEXT:    s_branch .LBB12_2
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index af50e09f509a3b..fbab12b6584899 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -19,7 +19,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i32
-; GCN: v_cndmask_b32
+; GCN: s_cselect_b32
 define amdgpu_kernel void @s_cmp_zext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 entry:
   %tmp0 = icmp eq i32 %a, %b
@@ -54,9 +54,9 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i
 ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
 ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
 ; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
-; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; GCN: buffer_store_short [[RESULT]]
+; FIXME: for mcpu=tonga, a v_cndmask_b32_e64 instruction is (still) created;
+; for the other mcpu runs, an s_cselect_b64 instruction is emitted.
+; GCN: buffer_store_short
 define amdgpu_kernel void @s_cmp_zext_i1_to_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
   %tmp0 = icmp eq i16 %a, %b
   %tmp1 = zext i1 %tmp0 to i16
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
index e2dada85ef8729..f1aed9d54dc419 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -121,31 +121,31 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_ashr_i32 s1, s0, 31
 ; CHECK-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
-; CHECK-NEXT:    s_add_u32 s0, s4, s0
-; CHECK-NEXT:    s_addc_u32 s1, s5, s1
-; CHECK-NEXT:    s_add_u32 s2, s0, -8
-; CHECK-NEXT:    s_addc_u32 s3, s1, -1
-; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 9
-; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; CHECK-NEXT:    s_add_u32 s2, s4, s0
+; CHECK-NEXT:    s_addc_u32 s3, s5, s1
+; CHECK-NEXT:    s_add_u32 s0, s2, -8
+; CHECK-NEXT:    s_addc_u32 s1, s3, -1
+; CHECK-NEXT:    s_cmp_eq_u64 s[2:3], 9
+; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; CHECK-NEXT:  .LBB3_1: ; %bb0
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; CHECK-NEXT:    s_cbranch_vccnz .LBB3_1
+; CHECK-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB3_1
 ; CHECK-NEXT:  ; %bb.2: ; %bb1
-; CHECK-NEXT:    s_mov_b64 s[0:1], exec
-; CHECK-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
-; CHECK-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; CHECK-NEXT:    s_mov_b64 s[2:3], exec
+; CHECK-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; CHECK-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB3_4
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
-; CHECK-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
+; CHECK-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; CHECK-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
 ; CHECK-NEXT:    v_mul_f64 v[0:1], s[6:7], v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_wbinvl1_vol
 ; CHECK-NEXT:  .LBB3_4:


