[llvm] [AMDGPU] V_SET_INACTIVE optimizations (PR #98864)

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 15 00:14:21 PDT 2024


https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/98864

Optimize V_SET_INACTIVE by allow it to run in WWM.
Hence WWM sections are not broken up for inactive lane setting.
WWM V_SET_INACTIVE can typically be lower to V_CNDMASK.
Some cases require use of exec manipulation V_MOV as previous code.
GFX9 sees slight instruction count increase in edge cases due to
smaller constant bus.

Additionally avoid introducing exec manipulation and V_MOVs where
a source of V_SET_INACTIVE is the destination.
This is a common pattern as WWM register pre-allocation often
assigns the same register.

>From 574004452bc1a3ce1e0be12764ffb67e4d3cb668 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Sun, 14 Jul 2024 11:08:21 +0900
Subject: [PATCH 1/2] [AMDGPU] Ensure that V_SET_INACTIVE inactive input is WWM
 computed

WWM global flag must be set to ensure V_SET_INACTIVE inactive lane
input is computed in WWM.
Full lowering may be skipped if global flag is not present.
---
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp    |   1 +
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    | 124 ++++++-----
 .../llvm.amdgcn.set.inactive.chain.arg.ll     | 192 +++++++++++++-----
 3 files changed, 217 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 19e761ef45b25..188c69a8733f9 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -564,6 +564,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           }
         }
         SetInactiveInstrs.push_back(&MI);
+        GlobalFlags |= StateStrictWWM;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 0c60be9d94591..db79d27759fd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -96,14 +96,16 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
 ; GCN-LABEL: set_inactive_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -117,16 +119,18 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 ; GCN-LABEL: set_inactive_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT:    s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 0xcccccccd
+; GCN-NEXT:    s_mov_b32 s7, 0x4010cccc
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -140,14 +144,16 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
 ; GCN-LABEL: set_inactive_v2i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -160,14 +166,16 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
 define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; GCN-LABEL: set_inactive_v2f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -181,16 +189,18 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 1
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 1
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -205,16 +215,18 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 1.0
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 1.0
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -228,14 +240,16 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
 ; GCN-LABEL: set_inactive_v2bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -249,16 +263,18 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 ; GCN-LABEL: set_inactive_v4i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x10001
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 0x10001
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -273,16 +289,18 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 ; GCN-LABEL: set_inactive_v4f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 0x3c003c00
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
@@ -297,16 +315,18 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
 ; GCN-LABEL: set_inactive_v4bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x3f803f80
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 0x3f803f80
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, v1
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index b3acd4949301e..e6acd9996b09a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -12,9 +12,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11-LABEL: set_inactive_chain_arg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v0
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    global_store_b32 v[8:9], v0, off
 ; GFX11-NEXT:    s_endpgm
@@ -22,9 +26,12 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX10-LABEL: set_inactive_chain_arg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10-NEXT:    v_mov_b32_e32 v0, v0
 ; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-NEXT:    global_store_dword v[8:9], v0, off
 ; GFX10-NEXT:    s_endpgm
@@ -32,9 +39,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11_W64-LABEL: set_inactive_chain_arg:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v0
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v0, off
 ; GFX11_W64-NEXT:    s_endpgm
@@ -42,9 +53,12 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX10_W64-LABEL: set_inactive_chain_arg:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v11
 ; GFX10_W64-NEXT:    s_not_b64 exec, exec
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v0
 ; GFX10_W64-NEXT:    s_not_b64 exec, exec
 ; GFX10_W64-NEXT:    global_store_dword v[8:9], v0, off
 ; GFX10_W64-NEXT:    s_endpgm
@@ -54,53 +68,135 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 }
 
 define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i64 %inactive, i64 %active) {
-; GFX11-LABEL: set_inactive_chain_arg_64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v12
-; GFX11-NEXT:    v_mov_b32_e32 v1, v13
-; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_mov_b32_e32 v0, v10
-; GFX11-NEXT:    v_mov_b32_e32 v1, v11
-; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11-NEXT:    s_endpgm
+; GISEL11-LABEL: set_inactive_chain_arg_64:
+; GISEL11:       ; %bb.0:
+; GISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11-NEXT:    s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_mov_b32 v1, v11
+; GISEL11-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL11-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL11-NEXT:    v_mov_b32_e32 v0, v0
+; GISEL11-NEXT:    v_mov_b32_e32 v1, v1
+; GISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL11-NEXT:    global_store_b64 v[8:9], v[0:1], off
+; GISEL11-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: set_inactive_chain_arg_64:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v12
-; GFX10-NEXT:    v_mov_b32_e32 v1, v13
-; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_mov_b32_e32 v0, v10
-; GFX10-NEXT:    v_mov_b32_e32 v1, v11
-; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
-; GFX10-NEXT:    s_endpgm
+; DAGISEL11-LABEL: set_inactive_chain_arg_64:
+; DAGISEL11:       ; %bb.0:
+; DAGISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11-NEXT:    s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT:    v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10
+; DAGISEL11-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL11-NEXT:    v_mov_b32_e32 v1, v13
+; DAGISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v0
+; DAGISEL11-NEXT:    v_mov_b32_e32 v1, v1
+; DAGISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL11-NEXT:    global_store_b64 v[8:9], v[0:1], off
+; DAGISEL11-NEXT:    s_endpgm
 ;
-; GFX11_W64-LABEL: set_inactive_chain_arg_64:
-; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v12
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, v13
-; GFX11_W64-NEXT:    s_not_b64 exec, exec
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, v11
-; GFX11_W64-NEXT:    s_not_b64 exec, exec
-; GFX11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11_W64-NEXT:    s_endpgm
+; GISEL10-LABEL: set_inactive_chain_arg_64:
+; GISEL10:       ; %bb.0:
+; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT:    s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT:    v_mov_b32_e32 v0, v10
+; GISEL10-NEXT:    v_mov_b32_e32 v1, v11
+; GISEL10-NEXT:    s_mov_b32 exec_lo, s0
+; GISEL10-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL10-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT:    v_mov_b32_e32 v0, v0
+; GISEL10-NEXT:    v_mov_b32_e32 v1, v1
+; GISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
+; GISEL10-NEXT:    s_endpgm
 ;
-; GFX10_W64-LABEL: set_inactive_chain_arg_64:
-; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v12
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, v13
-; GFX10_W64-NEXT:    s_not_b64 exec, exec
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, v11
-; GFX10_W64-NEXT:    s_not_b64 exec, exec
-; GFX10_W64-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
-; GFX10_W64-NEXT:    s_endpgm
+; DAGISEL10-LABEL: set_inactive_chain_arg_64:
+; DAGISEL10:       ; %bb.0:
+; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT:    s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v11
+; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v10
+; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v13
+; DAGISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v0
+; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v1
+; DAGISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
+; DAGISEL10-NEXT:    s_endpgm
+;
+; GISEL11_W64-LABEL: set_inactive_chain_arg_64:
+; GISEL11_W64:       ; %bb.0:
+; GISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v1, v11
+; GISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL11_W64-NEXT:    s_not_b64 exec, exec
+; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v0
+; GISEL11_W64-NEXT:    v_mov_b32_e32 v1, v1
+; GISEL11_W64-NEXT:    s_not_b64 exec, exec
+; GISEL11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
+; GISEL11_W64-NEXT:    s_endpgm
+;
+; DAGISEL11_W64-LABEL: set_inactive_chain_arg_64:
+; DAGISEL11_W64:       ; %bb.0:
+; DAGISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v11
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; DAGISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v13
+; DAGISEL11_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v0
+; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v1
+; DAGISEL11_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
+; DAGISEL11_W64-NEXT:    s_endpgm
+;
+; GISEL10_W64-LABEL: set_inactive_chain_arg_64:
+; GISEL10_W64:       ; %bb.0:
+; GISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v1, v11
+; GISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL10_W64-NEXT:    s_not_b64 exec, exec
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, v0
+; GISEL10_W64-NEXT:    v_mov_b32_e32 v1, v1
+; GISEL10_W64-NEXT:    s_not_b64 exec, exec
+; GISEL10_W64-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
+; GISEL10_W64-NEXT:    s_endpgm
+;
+; DAGISEL10_W64-LABEL: set_inactive_chain_arg_64:
+; DAGISEL10_W64:       ; %bb.0:
+; DAGISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v11
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; DAGISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v12
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v13
+; DAGISEL10_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v0
+; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v1
+; DAGISEL10_W64-NEXT:    s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
+; DAGISEL10_W64-NEXT:    s_endpgm
   %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0
   store i64 %tmp, ptr addrspace(1) %out
   ret void

>From 5736b0812865726490fbd4a2b710f4eb31fa291c Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Sun, 14 Jul 2024 17:31:13 +0900
Subject: [PATCH 2/2] [AMDGPU] V_SET_INACTIVE optimizations

Optimize V_SET_INACTIVE by allow it to run in WWM.
Hence WWM sections are not broken up for inactive lane setting.
WWM V_SET_INACTIVE can typically be lower to V_CNDMASK.
Some cases require use of exec manipulation V_MOV as previous code.
GFX9 sees slight instruction count increase in edge cases due to
smaller constant bus.

Additionally avoid introducing exec manipulation and V_MOVs where
a source of V_SET_INACTIVE is the destination.
This is a common pattern as WWM register pre-allocation often
assigns the same register.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 169 +++++--
 llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp    |  57 ++-
 .../GlobalISel/llvm.amdgcn.set.inactive.ll    |  36 --
 .../atomic_optimizations_pixelshader.ll       |  66 +--
 llvm/test/CodeGen/AMDGPU/cse-convergent.ll    |  14 +-
 llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll |  18 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        | 300 ++++--------
 .../AMDGPU/global_atomics_scan_fmax.ll        | 246 ++++------
 .../AMDGPU/global_atomics_scan_fmin.ll        | 246 ++++------
 .../AMDGPU/global_atomics_scan_fsub.ll        | 300 ++++--------
 .../llvm.amdgcn.set.inactive.chain.arg.ll     |  79 +--
 .../AMDGPU/set-inactive-wwm-overwrite.ll      |  12 +-
 .../AMDGPU/should-not-hoist-set-inactive.ll   |   5 +-
 llvm/test/CodeGen/AMDGPU/wqm.ll               |  48 +-
 llvm/test/CodeGen/AMDGPU/wqm.mir              |   4 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 212 ++++----
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll      | 462 +++++++++---------
 17 files changed, 954 insertions(+), 1320 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index cc1b9ac0c9ecd..d551a7887e706 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2273,37 +2273,148 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-      .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-      .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-      .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
+
+    bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+
+    // Find implicit exec src if this is running in WWM.
+    Register ExecSrcReg = 0;
+    for (auto &Op : MI.implicit_operands()) {
+      if (Op.isDef() || !Op.isReg())
+        continue;
+      Register OpReg = Op.getReg();
+      if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+          OpReg == AMDGPU::SCC)
+        continue;
+      ExecSrcReg = OpReg;
+      break;
+    }
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // present an issue.
+    // Fallback to V_MOV base lowering in all but the common cases.
+    bool InWWM = !!ExecSrcReg;
+    bool UseVCndMask = false;
+    if (InWWM) {
+      const MachineFunction *MF = MI.getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF->getRegInfo();
+      const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+      const MCInstrDesc &Desc = get(Opcode);
+      int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+      int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+      int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+      int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+      int ConstantBusUses = 1; // Starts at one for ExecRegSrc
+      int LiteralConstants = 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
+      LiteralConstants +=
+          ActiveSrc.isImm() &&
+                  !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
+              ? 1
+              : 0;
+      LiteralConstants +=
+          InactiveSrc.isImm() &&
+                  !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
+              ? 1
+              : 0;
+      UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
+                    LiteralConstants <= LiteralLimit &&
+                    (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+    }
+
+    if (UseVCndMask && VMov64) {
+      // WWM B64; decompose to two B32 operations.
+      // Test above ensures that both sources are registers.
+      // Note: this is done to avoid falling back to V_MOV multiple times
+      // and introducing exec manipulation for each VGPR separately.
+      assert(ActiveSrc.isReg() && InactiveSrc.isReg());
+      Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
+      Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
+      Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
+      Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
+      MachineInstr *Tmp;
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub0))
+                .addReg(InactiveLo)
+                .addReg(ActiveLo)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub1))
+                .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+    } else if (UseVCndMask) {
+      // WWM B32; use V_CNDMASK.
+      MachineInstr *VCndMask =
+          BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+              .addImm(0)
+              .add(InactiveSrc)
+              .addImm(0)
+              .add(ActiveSrc)
+              .addReg(ExecSrcReg);
+      // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
+      for (auto &Op : MI.implicit_operands()) {
+        if (!Op.isDef())
+          continue;
+        VCndMask->addOperand(Op);
+      }
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a src is the destination.
+      // This can happen if WWM register allocation was efficient.
+      bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!SkipActive) {
+        if (InWWM) {
+          // Cancel WWM
+          BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        }
+        // Copy active lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      if (!SkipInactive) {
+        // Set exec mask to inactive lanes
+        MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+                                   .addReg(InWWM ? ExecSrcReg : ExecReg);
+        ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+        // Copy inactive lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+        if (!InWWM) {
+          // Restore original exec mask
+          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
+        }
+      }
+      if (InWWM) {
+        // Restore WWM
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+      }
+    }
     MI.eraseFromParent();
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 188c69a8733f9..e01c045e7ef3d 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -178,6 +178,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
   SmallVector<MachineInstr *, 4> KillInstrs;
   SmallVector<MachineInstr *, 4> InitExecInstrs;
+  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
 
   void printInfo();
 
@@ -225,6 +226,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
   void lowerInitExec(MachineInstr &MI);
   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
 
+  void harmonizeTransitions();
+
 public:
   static char ID;
 
@@ -477,7 +480,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
-  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
   bool HasImplicitDerivatives =
       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -554,6 +556,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         GlobalFlags |= StateStrictWQM;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+        // Disable strict states here while marking, relax it later.
         III.Disabled = StateStrict;
         MachineOperand &Inactive = MI.getOperand(2);
         if (Inactive.isReg()) {
@@ -565,6 +568,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         }
         SetInactiveInstrs.push_back(&MI);
         GlobalFlags |= StateStrictWWM;
+        BBI.NeedsLowering = true;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -1038,6 +1042,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
 
   SmallVector<MachineInstr *, 4> SplitPoints;
+  Register ActiveLanesReg = 0;
   char State = BI.InitialState;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1054,6 +1059,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(MBB, MI);
       break;
+    case AMDGPU::ENTER_STRICT_WWM:
+      ActiveLanesReg = MI.getOperand(0).getReg();
+      break;
+    case AMDGPU::EXIT_STRICT_WWM:
+      ActiveLanesReg = 0;
+      break;
+    case AMDGPU::V_SET_INACTIVE_B32:
+    case AMDGPU::V_SET_INACTIVE_B64:
+      if (ActiveLanesReg) {
+        MI.addOperand(*MBB.getParent(),
+                      MachineOperand::CreateReg(ActiveLanesReg, false, true));
+      } else
+        assert(State == StateExact || State == StateWQM);
+      break;
     default:
       break;
     }
@@ -1618,6 +1637,40 @@ SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
   return InsertPt;
 }
 
+void SIWholeQuadMode::harmonizeTransitions() {
+  // Relax requirements on SET_INACTIVE to allow it in WWM regions.
+  for (MachineInstr *MI : SetInactiveInstrs) {
+    if (MI->getOpcode() == AMDGPU::COPY)
+      continue;
+
+    Instructions[MI].Disabled &= ~StateStrictWWM;
+
+    auto MBB = MI->getParent();
+    auto It = MI->getIterator();
+    if (It == MBB->end())
+      continue;
+
+    bool AddWWM = false;
+    auto NextMI = std::next(It);
+    if (NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+        NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
+      // Groups of SET_INACTIVE are more efficient in WWM.
+      AddWWM = true;
+    } else {
+      // Back propagate WWM needs of next instruction.
+      auto III = Instructions.find(&*NextMI);
+      AddWWM =
+          (III != Instructions.end() && III->second.Needs & StateStrictWWM);
+    }
+
+    if (!AddWWM)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "merge into WWM: " << *MI);
+    Instructions[MI].Needs |= StateStrictWWM;
+  }
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                     << " ------------- \n");
@@ -1630,6 +1683,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LowerToMovInstrs.clear();
   KillInstrs.clear();
   InitExecInstrs.clear();
+  SetInactiveInstrs.clear();
   StateTransition.clear();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1702,6 +1756,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
     LIS->InsertMachineInstrInMaps(*MI);
     lowerKillInstrs(true);
   } else {
+    harmonizeTransitions();
     for (auto BII : Blocks)
       processBlock(*BII.first, BII.first == &Entry);
     // Lowering blocks causes block splitting so perform as a second pass.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index db79d27759fd6..8fb4f2cd79a70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -104,9 +104,6 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -128,10 +125,6 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, v1
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -152,9 +145,6 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -174,9 +164,6 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half>
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -198,10 +185,6 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, v1
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -224,10 +207,6 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, v1
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -248,9 +227,6 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -272,10 +248,6 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, v1
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -298,10 +270,6 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, v1
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -324,10 +292,6 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, v1
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 29704959fc176..099289d59f94d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -277,11 +277,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
 ; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[10:11]
+; GFX8-NEXT:    s_nop 1
 ; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX8-NEXT:    s_nop 1
 ; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -334,11 +332,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
 ; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
 ; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[10:11]
+; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -386,13 +382,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB1_4
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[10:11]
 ; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -450,13 +443,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
 ; GFX1032-NEXT:    s_cbranch_execz .LBB1_4
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s9
 ; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -505,34 +495,31 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1164-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB1_4
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
 ; GFX1164-NEXT:    s_or_saveexec_b64 s[10:11], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[10:11]
 ; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s12, v1, 31
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_readlane_b32 s12, v1, 31
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, s12
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-NEXT:    v_readlane_b32 s12, v1, 15
 ; GFX1164-NEXT:    v_readlane_b32 s13, v1, 31
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_writelane_b32 v3, s12, 16
 ; GFX1164-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_or_saveexec_b64 s[10:11], -1
 ; GFX1164-NEXT:    v_readlane_b32 s12, v1, 63
@@ -581,33 +568,30 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1132-NEXT:    s_and_saveexec_b32 s8, s9
 ; GFX1132-NEXT:    s_cbranch_execz .LBB1_4
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX1132-NEXT:    s_or_saveexec_b32 s9, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s9
 ; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    v_readlane_b32 s11, v1, 31
 ; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-NEXT:    v_readlane_b32 s10, v1, 15
 ; GFX1132-NEXT:    s_mov_b32 exec_lo, s9
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_or_saveexec_b32 s9, -1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-NEXT:    v_writelane_b32 v3, s10, 16
 ; GFX1132-NEXT:    s_mov_b32 exec_lo, s9
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132-NEXT:    s_and_saveexec_b32 s9, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
index 0d74bd39b56fe..7aca63d34f51b 100644
--- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
@@ -12,12 +12,7 @@ define i32 @test(i32 %val, i32 %cond) {
 ; GCN-NEXT:    s_mov_b32 exec_lo, s4
 ; GCN-NEXT:    s_or_saveexec_b32 s4, -1
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_mov_b32 exec_lo, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, v0
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
-; GCN-NEXT:    s_or_saveexec_b32 s4, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s4
 ; GCN-NEXT:    v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GCN-NEXT:    s_mov_b32 exec_lo, s4
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
@@ -27,12 +22,7 @@ define i32 @test(i32 %val, i32 %cond) {
 ; GCN-NEXT:  ; %bb.1: ; %if
 ; GCN-NEXT:    s_or_saveexec_b32 s5, -1
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_mov_b32 exec_lo, s5
-; GCN-NEXT:    v_mov_b32_e32 v3, v0
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
-; GCN-NEXT:    s_or_saveexec_b32 s5, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s5
 ; GCN-NEXT:    v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GCN-NEXT:    s_mov_b32 exec_lo, s5
 ; GCN-NEXT:    v_mov_b32_e32 v5, v2
diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
index 82dc6d21cfe33..310f32ce8f83b 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
@@ -6,16 +6,13 @@
 define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) {
 ; GCN-LABEL: wwm:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s7, s4
 ; GCN-NEXT:    s_mov_b32 s6, s3
 ; GCN-NEXT:    s_mov_b32 s5, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    s_mov_b32 s7, s4
 ; GCN-NEXT:    s_mov_b32 s4, s1
 ; GCN-NEXT:    s_mov_b32 s1, 1
-; GCN-NEXT:    v_mov_b32_e32 v0, 4
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, 4, s[2:3]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
@@ -63,16 +60,13 @@ work:
 define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) {
 ; GCN-LABEL: strict_wwm:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s7, s4
 ; GCN-NEXT:    s_mov_b32 s6, s3
 ; GCN-NEXT:    s_mov_b32 s5, s2
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    s_mov_b32 s7, s4
 ; GCN-NEXT:    s_mov_b32 s4, s1
 ; GCN-NEXT:    s_mov_b32 s1, 1
-; GCN-NEXT:    v_mov_b32_e32 v0, 4
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, 4, s[2:3]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 04df04a5c299b..da26342982ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -710,10 +710,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -792,14 +791,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -874,14 +868,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -940,41 +929,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -1009,15 +992,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1916,10 +1894,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1998,14 +1975,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2080,14 +2052,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2146,41 +2113,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -2228,15 +2189,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3122,10 +3078,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3204,14 +3159,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3286,14 +3236,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3352,41 +3297,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -3421,15 +3360,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3844,10 +3778,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3926,14 +3859,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4008,14 +3936,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4074,41 +3997,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -4143,15 +4060,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5049,10 +4961,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5131,14 +5042,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5213,14 +5119,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5279,41 +5180,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -5361,15 +5256,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 005cd3a0021b3..d15e352f4a1ab 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -611,10 +611,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -702,15 +701,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -780,15 +774,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -840,48 +829,42 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -918,44 +901,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -1732,10 +1710,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1823,15 +1800,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1915,15 +1887,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1989,48 +1956,42 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2083,45 +2044,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -2913,10 +2869,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3004,15 +2959,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3096,15 +3046,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3170,48 +3115,42 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3264,45 +3203,40 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 3f4779f08e42f..9fa953ff117fe 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -611,10 +611,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -702,15 +701,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -780,15 +774,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -840,48 +829,42 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -918,44 +901,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v4
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -1732,10 +1710,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1823,15 +1800,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1915,15 +1887,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1989,48 +1956,42 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2083,45 +2044,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
@@ -2913,10 +2869,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3004,15 +2959,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1064-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1064-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3096,15 +3046,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
 ; GFX1032-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX1032-DPP-NEXT:    v_min_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3170,48 +3115,42 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3264,45 +3203,40 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
 ; GFX1132-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132-DPP-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX1132-DPP-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-DPP-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1132-DPP-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-DPP-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX1132-DPP-NEXT:  ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 64650e2733a00..8f755728c59c0 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -788,10 +788,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -870,14 +869,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -952,14 +946,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1018,41 +1007,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -1100,15 +1083,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2020,10 +1998,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2102,14 +2079,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2184,14 +2156,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2250,41 +2217,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -2332,15 +2293,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3252,10 +3208,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3334,14 +3289,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3416,14 +3366,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3482,41 +3427,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -3564,15 +3503,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4026,10 +3960,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4108,14 +4041,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4190,14 +4118,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4256,41 +4179,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -4338,15 +4255,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5257,10 +5169,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-DPP-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
+; GFX9-DPP-NEXT:    s_not_b64 exec, s[0:1]
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT:    s_not_b64 exec, exec
-; GFX9-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT:    s_mov_b64 exec, -1
 ; GFX9-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
 ; GFX9-DPP-NEXT:    s_nop 1
 ; GFX9-DPP-NEXT:    v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5339,14 +5250,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1064-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1064-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1064-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1064-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1064-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5421,14 +5327,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1032-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT:    v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT:    v_cndmask_b32_e64 v4, 0x80000000, v0, s0
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT:    v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1032-DPP-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX1032-DPP-NEXT:    v_bfrev_b32_e32 v4, 1
 ; GFX1032-DPP-NEXT:    v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5487,41 +5388,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT:    s_not_b64 exec, exec
-; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1164-DPP-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_permlane64_b32 v2, v1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT:    s_waitcnt_depctr 0xfffe
 ; GFX1164-DPP-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX1164-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
 ; GFX1164-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v0, v1
@@ -5569,15 +5464,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT:    v_cndmask_b32_e64 v2, 0x80000000, v0, s0
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX1132-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-DPP-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index e6acd9996b09a..a1d24571bbbaa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -16,10 +16,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v11
-; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v0
-; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    global_store_b32 v[8:9], v0, off
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -30,9 +26,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v11
-; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_mov_b32_e32 v0, v0
-; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-NEXT:    global_store_dword v[8:9], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -43,10 +36,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v11
-; GFX11_W64-NEXT:    s_not_b64 exec, exec
-; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v0
-; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v0, off
 ; GFX11_W64-NEXT:    s_endpgm
 ;
@@ -57,9 +46,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v11
-; GFX10_W64-NEXT:    s_not_b64 exec, exec
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v0
-; GFX10_W64-NEXT:    s_not_b64 exec, exec
 ; GFX10_W64-NEXT:    global_store_dword v[8:9], v0, off
 ; GFX10_W64-NEXT:    s_endpgm
   %tmp = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0
@@ -76,11 +62,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; GISEL11-NEXT:    s_mov_b32 exec_lo, s0
 ; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
-; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GISEL11-NEXT:    v_mov_b32_e32 v0, v0
-; GISEL11-NEXT:    v_mov_b32_e32 v1, v1
-; GISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GISEL11-NEXT:    global_store_b64 v[8:9], v[0:1], off
 ; GISEL11-NEXT:    s_endpgm
 ;
@@ -92,11 +73,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; DAGISEL11-NEXT:    s_mov_b32 exec_lo, s0
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v1, v13
-; DAGISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
-; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v0
-; DAGISEL11-NEXT:    v_mov_b32_e32 v1, v1
-; DAGISEL11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; DAGISEL11-NEXT:    global_store_b64 v[8:9], v[0:1], off
 ; DAGISEL11-NEXT:    s_endpgm
 ;
@@ -109,10 +85,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; GISEL10-NEXT:    s_mov_b32 exec_lo, s0
 ; GISEL10-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL10-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
-; GISEL10-NEXT:    v_mov_b32_e32 v0, v0
-; GISEL10-NEXT:    v_mov_b32_e32 v1, v1
-; GISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GISEL10-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
 ; GISEL10-NEXT:    s_endpgm
 ;
@@ -125,10 +97,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s0
 ; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v13
-; DAGISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
-; DAGISEL10-NEXT:    v_mov_b32_e32 v0, v0
-; DAGISEL10-NEXT:    v_mov_b32_e32 v1, v1
-; DAGISEL10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; DAGISEL10-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
 ; DAGISEL10-NEXT:    s_endpgm
 ;
@@ -141,11 +109,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; GISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL11_W64-NEXT:    s_not_b64 exec, exec
-; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v0
-; GISEL11_W64-NEXT:    v_mov_b32_e32 v1, v1
-; GISEL11_W64-NEXT:    s_not_b64 exec, exec
 ; GISEL11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
 ; GISEL11_W64-NEXT:    s_endpgm
 ;
@@ -158,11 +121,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; DAGISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v13
-; DAGISEL11_W64-NEXT:    s_not_b64 exec, exec
-; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v0
-; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v1, v1
-; DAGISEL11_W64-NEXT:    s_not_b64 exec, exec
 ; DAGISEL11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
 ; DAGISEL11_W64-NEXT:    s_endpgm
 ;
@@ -175,10 +133,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; GISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL10_W64-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL10_W64-NEXT:    s_not_b64 exec, exec
-; GISEL10_W64-NEXT:    v_mov_b32_e32 v0, v0
-; GISEL10_W64-NEXT:    v_mov_b32_e32 v1, v1
-; GISEL10_W64-NEXT:    s_not_b64 exec, exec
 ; GISEL10_W64-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
 ; GISEL10_W64-NEXT:    s_endpgm
 ;
@@ -191,10 +145,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; DAGISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v13
-; DAGISEL10_W64-NEXT:    s_not_b64 exec, exec
-; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v0, v0
-; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v1, v1
-; DAGISEL10_W64-NEXT:    s_not_b64 exec, exec
 ; DAGISEL10_W64-NEXT:    global_store_dwordx2 v[8:9], v[0:1], off
 ; DAGISEL10_W64-NEXT:    s_endpgm
   %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0
@@ -209,16 +159,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v11
-; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v0
-; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11-NEXT:    global_store_b32 v[8:9], v2, off
 ; GFX11-NEXT:    s_endpgm
@@ -229,11 +176,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX10-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-NEXT:    v_mov_b32_e32 v0, v11
-; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_mov_b32_e32 v0, v0
-; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s0
@@ -247,17 +191,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11_W64-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v11
-; GFX11_W64-NEXT:    s_not_b64 exec, exec
-; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v0
-; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11_W64-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11_W64-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11_W64-NEXT:    v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX11_W64-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v2, off
 ; GFX11_W64-NEXT:    s_endpgm
@@ -268,11 +208,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10_W64-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v11
-; GFX10_W64-NEXT:    s_not_b64 exec, exec
-; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v0
-; GFX10_W64-NEXT:    s_not_b64 exec, exec
 ; GFX10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GFX10_W64-NEXT:    s_mov_b64 exec, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index 81858bd3d29ee..f60786c1bacbf 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -15,11 +15,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i
 ; GCN-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GCN-NEXT:    s_cbranch_execz .LBB0_4
 ; GCN-NEXT:  ; %bb.3: ; %.then
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GCN-NEXT:    s_or_saveexec_b32 s1, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GCN-NEXT:    s_mov_b32 exec_lo, s1
@@ -82,12 +79,7 @@ define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs
 ; GCN-NEXT:  .LBB1_5: ; %.else
 ; GCN-NEXT:    s_or_saveexec_b32 s1, -1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_mov_b32 exec_lo, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, v3
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
-; GCN-NEXT:    s_or_saveexec_b32 s1, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s1
 ; GCN-NEXT:    v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
 ; GCN-NEXT:    s_mov_b32 exec_lo, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
index 09e342fe19066..90b32e29e98f6 100644
--- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
@@ -23,11 +23,8 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i
 ; GCN-NEXT:    s_cbranch_execz .LBB0_1
 ; GCN-NEXT:  ; %bb.3: ; %bb1
 ; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GCN-NEXT:    s_or_saveexec_b32 s9, -1
+; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, s4, s9
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
 ; GCN-NEXT:    s_mov_b32 exec_lo, s9
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 11003c4c9edfd..2ab0300b97aa0 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -835,12 +835,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-W64-NEXT:    s_not_b64 exec, exec
-; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT:    s_not_b64 exec, exec
 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
 ; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
@@ -851,12 +848,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s0
 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
@@ -2263,11 +2257,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-W64-NEXT:    s_not_b64 exec, exec
-; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-W64-NEXT:    s_not_b64 exec, exec
 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
 ; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2293,11 +2284,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s0
 ; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2744,12 +2732,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-W64-NEXT:    s_not_b64 exec, exec
-; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT:    s_not_b64 exec, exec
 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
 ; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
@@ -2760,12 +2745,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
-; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s0
 ; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
@@ -2799,11 +2781,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-W64-NEXT:    s_not_b64 exec, exec
-; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-W64-NEXT:    s_not_b64 exec, exec
 ; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
 ; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
 ; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2829,11 +2808,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s0
 ; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
 ; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index ef6d0780f395f..534865173d9a5 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -282,10 +282,10 @@ body:             |
 #
 #CHECK-NOT: ENTER_STRICT_WWM
 #CHECK: BUFFER_LOAD_DWORDX2
-#CHECK-NOT: ENTER_STRICT_WWM
+#CHECK: ENTER_STRICT_WWM
 #CHECK: V_SET_INACTIVE_B32
 #CHECK: V_SET_INACTIVE_B32
-#CHECK: ENTER_STRICT_WWM
+#CHECK-NOT: ENTER_STRICT_WWM
 #CHECK: V_MAX
 name:            test_wwm_set_inactive_propagation
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e79cb66dcd776..b62fe599f88ac 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -30,15 +30,17 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s34
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[40:41]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -80,17 +82,10 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[34:35]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[34:35]
+; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
@@ -177,11 +172,12 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -208,12 +204,8 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[34:35]
+; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v2, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
@@ -270,34 +262,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[34:35]
+; GFX9-O3-NEXT:    s_nop 1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-O3-NEXT:    s_and_saveexec_b64 s[34:35], vcc
-; GFX9-O3-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-O3-NEXT:  ; %bb.1: ; %if
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[36:37]
+; GFX9-O3-NEXT:    s_nop 1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT:  .LBB1_2: ; %merge
+; GFX9-O3-NEXT:  ; %bb.2: ; %merge
 ; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[34:35]
 ; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -559,7 +542,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s48, s33
+; GFX9-O0-NEXT:    s_mov_b32 s50, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -583,41 +566,42 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 0
 ; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[38:39], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s38, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s39, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s8
-; GFX9-O0-NEXT:    s_mov_b32 s38, s6
+; GFX9-O0-NEXT:    s_mov_b32 s40, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s4
-; GFX9-O0-NEXT:    ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39
-; GFX9-O0-NEXT:    s_mov_b32 s39, s7
-; GFX9-O0-NEXT:    s_mov_b32 s35, s39
-; GFX9-O0-NEXT:    s_mov_b32 s44, s38
+; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
+; GFX9-O0-NEXT:    s_mov_b32 s41, s7
+; GFX9-O0-NEXT:    s_mov_b32 s35, s41
+; GFX9-O0-NEXT:    s_mov_b32 s42, s40
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
 ; GFX9-O0-NEXT:    s_mov_b32 s37, s5
-; GFX9-O0-NEXT:    s_mov_b32 s45, s37
-; GFX9-O0-NEXT:    s_mov_b32 s40, s36
-; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
-; GFX9-O0-NEXT:    s_mov_b32 s41, s45
-; GFX9-O0-NEXT:    s_mov_b32 s42, s44
-; GFX9-O0-NEXT:    s_mov_b32 s43, s35
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s40, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s41, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s42, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s43, 3
+; GFX9-O0-NEXT:    s_mov_b32 s43, s37
+; GFX9-O0-NEXT:    s_mov_b32 s44, s36
+; GFX9-O0-NEXT:    ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47
+; GFX9-O0-NEXT:    s_mov_b32 s45, s43
+; GFX9-O0-NEXT:    s_mov_b32 s46, s42
+; GFX9-O0-NEXT:    s_mov_b32 s47, s35
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s44, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s45, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s46, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s47, 5
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[48:49], -1
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[48:49]
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s9
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], 0
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[38:39]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s34
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s35
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[38:39]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s36
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s37
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s34, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s35, 5
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
@@ -634,20 +618,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[48:49], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[48:49]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s34, v6, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s35, v6, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s36, v6, 0
-; GFX9-O0-NEXT:    v_readlane_b32 s37, v6, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s38, v6, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s39, v6, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v6, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v6, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v6, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v6, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v6, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v6, 5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[48:49], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[48:49]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
@@ -679,7 +663,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff000
-; GFX9-O0-NEXT:    s_mov_b32 s33, s48
+; GFX9-O0-NEXT:    s_mov_b32 s33, s50
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -851,27 +835,31 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
 ; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
-; GFX9-O3-NEXT:    s_mov_b32 s34, -1
-; GFX9-O3-NEXT:    s_brev_b32 s35, -2
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT:    s_mov_b32 s36, -1
+; GFX9-O3-NEXT:    s_brev_b32 s37, -2
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s35
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s36
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s37
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s34
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s35
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s36
+; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s37
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v11
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, v12
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s34
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s35
+; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s36
+; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s37
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
@@ -951,6 +939,10 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    v_writelane_b32 v47, s65, 1
 ; GFX9-O0-NEXT:    v_writelane_b32 v47, s66, 2
 ; GFX9-O0-NEXT:    v_writelane_b32 v47, s67, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s68, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s69, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s70, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s71, 7
 ; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
@@ -1150,37 +1142,43 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71
 ; GFX9-O0-NEXT:    s_mov_b64 s[34:35], 0
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v11
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s34
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s35
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v35, v9
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s34
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s35
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v36, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v37, v7
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s34
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s35
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v38, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v39, v5
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s34
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s35
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v40, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v41, v3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
@@ -1233,6 +1231,10 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v23, s27
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v24, s28
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v25, s29
+; GFX9-O0-NEXT:    v_readlane_b32 s71, v47, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s70, v47, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s69, v47, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s68, v47, 4
 ; GFX9-O0-NEXT:    v_readlane_b32 s67, v47, 3
 ; GFX9-O0-NEXT:    v_readlane_b32 s66, v47, 2
 ; GFX9-O0-NEXT:    v_readlane_b32 s65, v47, 1
@@ -1289,30 +1291,36 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O3-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:12
 ; GFX9-O3-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:16
 ; GFX9-O3-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v32, v1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v33, v2
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v33, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v34, v3
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v35, v4
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v34, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v35, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v36, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v37, v6
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v36, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v37, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v38, v7
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v39, v8
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v38, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v39, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v40, v9
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v41, v10
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index def51f2b16d3e..c1a5810a6b515 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -26,15 +26,17 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -63,17 +65,10 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[4:5]
+; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
@@ -154,11 +149,12 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -185,12 +181,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v2, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
@@ -236,34 +228,25 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[4:5]
+; GFX9-O3-NEXT:    s_nop 1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-O3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-O3-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-O3-NEXT:  ; %bb.1: ; %if
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[6:7]
+; GFX9-O3-NEXT:    s_nop 1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT:  .LBB1_2: ; %merge
+; GFX9-O3-NEXT:  ; %bb.2: ; %merge
 ; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -361,35 +344,36 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-O0-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
+; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
 ; GFX9-O0-NEXT:    s_load_dword s2, s[0:1], 0x34
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 5
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s3, s7
-; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b32 s7, s9
-; GFX9-O0-NEXT:    s_mov_b32 s16, s8
+; GFX9-O0-NEXT:    s_mov_b32 s3, s9
+; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX9-O0-NEXT:    s_mov_b32 s9, s17
+; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT:    s_mov_b32 s17, s7
-; GFX9-O0-NEXT:    s_mov_b32 s18, s6
+; GFX9-O0-NEXT:    s_mov_b32 s17, s9
+; GFX9-O0-NEXT:    s_mov_b32 s18, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 9
 ; GFX9-O0-NEXT:    s_mov_b32 s3, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 9
 ; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-O0-NEXT:    s_not_b64 exec, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 56
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -418,13 +402,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -439,30 +423,31 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O3-LABEL: call:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-O3-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-O3-NEXT:    s_mov_b32 s14, -1
 ; GFX9-O3-NEXT:    s_load_dword s2, s[0:1], 0x34
 ; GFX9-O3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-O3-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-O3-NEXT:    s_mov_b32 s14, -1
 ; GFX9-O3-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-O3-NEXT:    s_add_u32 s12, s12, s3
-; GFX9-O3-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s32, 0
-; GFX9-O3-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-O3-NEXT:    s_add_u32 s8, s8, called at rel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s9, s9, called at rel32@hi+12
+; GFX9-O3-NEXT:    s_addc_u32 s13, s13, 0
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_getpc_b64 s[10:11]
+; GFX9-O3-NEXT:    s_add_u32 s10, s10, called at rel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s11, s11, called at rel32@hi+12
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[8:9]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[12:13]
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[14:15]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
@@ -597,35 +582,36 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s8, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s9, 5
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s6, s9
-; GFX9-O0-NEXT:    s_mov_b32 s7, s8
-; GFX9-O0-NEXT:    s_mov_b32 s8, s17
+; GFX9-O0-NEXT:    s_mov_b32 s6, s19
+; GFX9-O0-NEXT:    s_mov_b32 s7, s18
+; GFX9-O0-NEXT:    s_mov_b32 s15, s17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT:    s_mov_b32 s17, s8
+; GFX9-O0-NEXT:    s_mov_b32 s17, s15
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s7
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s7
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 60
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -662,12 +648,12 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -698,20 +684,19 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
 ; GFX9-O3-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
 ; GFX9-O3-NEXT:    s_mov_b32 s32, 0
 ; GFX9-O3-NEXT:    s_addc_u32 s13, s13, 0
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-O3-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-O3-NEXT:    s_add_u32 s4, s4, called_i64 at gotpcrel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s5, s5, called_i64 at gotpcrel32@hi+12
-; GFX9-O3-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-O3-NEXT:    s_add_u32 s0, s0, called_i64 at gotpcrel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s1, s1, called_i64 at gotpcrel32@hi+12
+; GFX9-O3-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s3
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[6:7]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[12:13]
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[14:15]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
@@ -817,27 +802,31 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
 ; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
-; GFX9-O3-NEXT:    s_mov_b32 s4, -1
-; GFX9-O3-NEXT:    s_brev_b32 s5, -2
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    s_mov_b32 s6, -1
+; GFX9-O3-NEXT:    s_brev_b32 s7, -2
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s7
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v11
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, v12
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s5
+; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
@@ -896,15 +885,17 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -933,17 +924,10 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[4:5]
+; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
@@ -1024,11 +1008,12 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1055,12 +1040,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9-O0-NEXT:    s_nop 1
 ; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v2, v1
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
@@ -1106,34 +1087,25 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[4:5]
+; GFX9-O3-NEXT:    s_nop 1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-O3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-O3-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9-O3-NEXT:  ; %bb.1: ; %if
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[6:7]
+; GFX9-O3-NEXT:    s_nop 1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT:  .LBB8_2: ; %merge
+; GFX9-O3-NEXT:  ; %bb.2: ; %merge
 ; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1231,35 +1203,36 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-O0-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
+; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
 ; GFX9-O0-NEXT:    s_load_dword s2, s[0:1], 0x34
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 5
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s3, s7
-; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
-; GFX9-O0-NEXT:    s_mov_b32 s7, s9
-; GFX9-O0-NEXT:    s_mov_b32 s16, s8
+; GFX9-O0-NEXT:    s_mov_b32 s3, s9
+; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX9-O0-NEXT:    s_mov_b32 s9, s17
+; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT:    s_mov_b32 s17, s7
-; GFX9-O0-NEXT:    s_mov_b32 s18, s6
+; GFX9-O0-NEXT:    s_mov_b32 s17, s9
+; GFX9-O0-NEXT:    s_mov_b32 s18, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s3
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 9
 ; GFX9-O0-NEXT:    s_mov_b32 s3, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 9
 ; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 10
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-O0-NEXT:    s_not_b64 exec, s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 56
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -1288,13 +1261,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 9
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 10
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v1, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v1, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v1, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v1, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v1, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v1, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v1, 10
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1309,30 +1282,31 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 ; GFX9-O3-LABEL: strict_wwm_call:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-O3-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX9-O3-NEXT:    s_mov_b32 s14, -1
 ; GFX9-O3-NEXT:    s_load_dword s2, s[0:1], 0x34
 ; GFX9-O3-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-O3-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-O3-NEXT:    s_mov_b32 s14, -1
 ; GFX9-O3-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-O3-NEXT:    s_add_u32 s12, s12, s3
-; GFX9-O3-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s32, 0
-; GFX9-O3-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-O3-NEXT:    s_add_u32 s8, s8, strict_wwm_called at rel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s9, s9, strict_wwm_called at rel32@hi+12
+; GFX9-O3-NEXT:    s_addc_u32 s13, s13, 0
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT:    s_getpc_b64 s[10:11]
+; GFX9-O3-NEXT:    s_add_u32 s10, s10, strict_wwm_called at rel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s11, s11, strict_wwm_called at rel32@hi+12
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[8:9]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[10:11], -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[12:13]
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[14:15]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[10:11]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_endpgm
@@ -1467,35 +1441,36 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
 ; GFX9-O0-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s8, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s9, 5
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT:    s_mov_b32 s6, s9
-; GFX9-O0-NEXT:    s_mov_b32 s7, s8
-; GFX9-O0-NEXT:    s_mov_b32 s8, s17
+; GFX9-O0-NEXT:    s_mov_b32 s6, s19
+; GFX9-O0-NEXT:    s_mov_b32 s7, s18
+; GFX9-O0-NEXT:    s_mov_b32 s15, s17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
 ; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT:    s_mov_b32 s17, s8
+; GFX9-O0-NEXT:    s_mov_b32 s17, s15
 ; GFX9-O0-NEXT:    s_mov_b32 s18, s7
 ; GFX9-O0-NEXT:    s_mov_b32 s19, s6
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s19, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
+; GFX9-O0-NEXT:    s_not_b64 exec, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s7
-; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s2, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s3, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 60
 ; GFX9-O0-NEXT:    s_mov_b32 s2, s0
 ; GFX9-O0-NEXT:    s_mov_b32 s0, s1
@@ -1532,12 +1507,12 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[20:21]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s0, v2, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s1, v2, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s2, v2, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s3, v2, 9
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v2, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v2, 5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[20:21], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1568,20 +1543,19 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
 ; GFX9-O3-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
 ; GFX9-O3-NEXT:    s_mov_b32 s32, 0
 ; GFX9-O3-NEXT:    s_addc_u32 s13, s13, 0
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-O3-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-O3-NEXT:    s_add_u32 s4, s4, strict_wwm_called_i64 at gotpcrel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s5, s5, strict_wwm_called_i64 at gotpcrel32@hi+12
-; GFX9-O3-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-O3-NEXT:    s_add_u32 s0, s0, strict_wwm_called_i64 at gotpcrel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s1, s1, strict_wwm_called_i64 at gotpcrel32@hi+12
+; GFX9-O3-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s3
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[6:7]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
 ; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[12:13]
 ; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[14:15]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
@@ -1687,27 +1661,31 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
 ; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
-; GFX9-O3-NEXT:    s_mov_b32 s4, -1
-; GFX9-O3-NEXT:    s_brev_b32 s5, -2
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT:    s_mov_b32 s6, -1
+; GFX9-O3-NEXT:    s_brev_b32 s7, -2
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-O3-NEXT:    s_not_b64 exec, exec
+; GFX9-O3-NEXT:    s_not_b64 exec, s[4:5]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s7
+; GFX9-O3-NEXT:    s_mov_b64 exec, -1
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v11
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, v12
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s5
+; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3



More information about the llvm-commits mailing list