[llvm] 7a10179 - Revert "[AMDGPU] Mark mbcnt as convergent"

Sameer Sahasrabuddhe via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 30 00:54:12 PDT 2023


Author: Sameer Sahasrabuddhe
Date: 2023-06-30T13:10:44+05:30
New Revision: 7a101798b7539743de08adc9ea7d7f748cab37eb

URL: https://github.com/llvm/llvm-project/commit/7a101798b7539743de08adc9ea7d7f748cab37eb
DIFF: https://github.com/llvm/llvm-project/commit/7a101798b7539743de08adc9ea7d7f748cab37eb.diff

LOG: Revert "[AMDGPU] Mark mbcnt as convergent"

This reverts commit 37114036aa57e53217a57afacd7f47b36114edfb.

The output of mbcnt does not depend on other active lanes, and hence it is not
convergent. The original change was made as a possible fix for

https://github.com/ROCm-Developer-Tools/HIP/issues/3172

But changing mbcnt does not fix that issue.

Reviewed By: ruiling, foad, yaxunl

Differential Revision: https://reviews.llvm.org/D153953

Added: 
    

Modified: 
    clang/test/CodeGenOpenCL/builtins-amdgcn.cl
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

Removed: 
    


################################################################################
diff  --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 74f3343f79e4a8..8e5d670b7b7880 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -690,14 +690,12 @@ kernel void test_gws_sema_p(uint id) {
 
 // CHECK-LABEL: @test_mbcnt_lo(
 // CHECK: call i32 @llvm.amdgcn.mbcnt.lo(i32 %src0, i32 %src1)
-// CHECK: declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #[[$MBCNT_ATTRS:[0-9]+]]
 kernel void test_mbcnt_lo(global uint* out, uint src0, uint src1) {
   *out = __builtin_amdgcn_mbcnt_lo(src0, src1);
 }
 
 // CHECK-LABEL: @test_mbcnt_hi(
 // CHECK: call i32 @llvm.amdgcn.mbcnt.hi(i32 %src0, i32 %src1)
-// CHECK: declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #[[$MBCNT_ATTRS]]
 kernel void test_mbcnt_hi(global uint* out, uint src0, uint src1) {
   *out = __builtin_amdgcn_mbcnt_hi(src0, src1);
 }
@@ -834,7 +832,6 @@ void test_atomic_inc_dec(local uint *lptr, global uint *gptr, uint val) {
 // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
 // CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) }
 // CHECK-DAG: attributes #[[$READ_EXEC_ATTRS]] = { convergent }
-// CHECK-DAG: attributes #[[$MBCNT_ATTRS]] = {{.* convergent .*}}
 // CHECK-DAG: ![[$EXEC]] = !{!"exec"}
 // CHECK-DAG: ![[$EXEC_LO]] = !{!"exec_lo"}
 // CHECK-DAG: ![[$EXEC_HI]] = !{!"exec_hi"}

diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 29b95620cd3f99..726f0bee648541 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1833,12 +1833,12 @@ def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty],
 def int_amdgcn_mbcnt_lo :
   ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-   [IntrNoMem, IntrConvergent]>;
+   [IntrNoMem]>;
 
 def int_amdgcn_mbcnt_hi :
   ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-            [IntrNoMem, IntrConvergent]>;
+            [IntrNoMem]>;
 
 // llvm.amdgcn.ds.swizzle src offset
 def int_amdgcn_ds_swizzle :

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 5db97156445463..2a738b18f930cd 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -449,10 +449,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX8-LABEL: add_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -469,7 +467,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -493,10 +493,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: add_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -513,7 +511,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -536,10 +536,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W64-LABEL: add_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX10W64-NEXT:    s_mov_b32 s4, 0
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX10W64-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -555,7 +553,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -579,7 +579,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W32-LABEL: add_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX10W32-NEXT:    s_mov_b32 s2, 0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr1
@@ -594,7 +593,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -618,11 +618,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W64-LABEL: add_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11W64-NEXT:    s_mov_b32 s4, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX11W64-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -640,9 +637,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr0
-; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX11W64-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX11W64-NEXT:  ; %bb.3:
@@ -665,7 +666,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11W32-NEXT:    s_mov_b32 s2, 0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr1
@@ -681,9 +681,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr0
-; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX11W32-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX11W32-NEXT:  ; %bb.3:
@@ -726,10 +728,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ;
 ; GFX8-LABEL: struct_add_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -746,7 +746,9 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -772,10 +774,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ;
 ; GFX9-LABEL: struct_add_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -792,7 +792,9 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -817,10 +819,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ;
 ; GFX10W64-LABEL: struct_add_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX10W64-NEXT:    s_mov_b32 s4, 0
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX10W64-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -836,7 +836,9 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -863,7 +865,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ;
 ; GFX10W32-LABEL: struct_add_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX10W32-NEXT:    s_mov_b32 s2, 0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr1
@@ -878,7 +879,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -905,11 +907,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ;
 ; GFX11W64-LABEL: struct_add_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11W64-NEXT:    s_mov_b32 s4, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX11W64-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -927,9 +926,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr0
-; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX11W64-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX11W64-NEXT:  ; %bb.3:
@@ -955,7 +958,6 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ;
 ; GFX11W32-LABEL: struct_add_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11W32-NEXT:    s_mov_b32 s2, 0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr1
@@ -971,9 +973,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr0
-; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX11W32-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX11W32-NEXT:  ; %bb.3:
@@ -1518,10 +1522,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX8-LABEL: sub_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1538,7 +1540,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -1562,10 +1566,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: sub_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1582,7 +1584,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -1605,10 +1609,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W64-LABEL: sub_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX10W64-NEXT:    s_mov_b32 s4, 0
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX10W64-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1624,7 +1626,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -1648,7 +1652,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W32-LABEL: sub_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX10W32-NEXT:    s_mov_b32 s2, 0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr1
@@ -1663,7 +1666,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -1687,11 +1691,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W64-LABEL: sub_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11W64-NEXT:    s_mov_b32 s4, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX11W64-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1709,9 +1710,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr0
-; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX11W64-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX11W64-NEXT:  ; %bb.3:
@@ -1734,7 +1739,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11W32-NEXT:    s_mov_b32 s2, 0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr1
@@ -1750,9 +1754,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr0
-; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX11W32-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX11W32-NEXT:  ; %bb.3:

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 4a90ed5fe74bb7..a3a2fc85123b92 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -516,10 +516,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX8-LABEL: add_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s6, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -537,7 +535,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -565,10 +565,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: add_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -586,7 +584,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -614,10 +614,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1064-LABEL: add_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_mov_b32 s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -634,7 +632,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -665,7 +665,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1032-LABEL: add_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -681,7 +680,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s5, exec_lo, s5
@@ -712,11 +712,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1164-LABEL: add_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b32 s6, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -735,9 +732,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -767,7 +768,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1132-LABEL: add_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -784,9 +784,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s5, exec_lo, s5
 ; GFX1132-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -2016,10 +2018,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX8-LABEL: sub_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s6, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2037,7 +2037,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2065,10 +2067,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX9-LABEL: sub_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2086,7 +2086,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2114,10 +2116,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1064-LABEL: sub_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_mov_b32 s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2134,7 +2134,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -2165,7 +2167,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1032-LABEL: sub_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s4, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -2181,7 +2182,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s5, exec_lo, s5
@@ -2212,11 +2214,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1164-LABEL: sub_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b32 s6, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB8_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2235,9 +2234,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -2267,7 +2270,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1132-LABEL: sub_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s4, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -2284,9 +2286,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
 ; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s5, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s5, exec_lo, s5
 ; GFX1132-NEXT:    s_cbranch_execz .LBB8_4
 ; GFX1132-NEXT:  ; %bb.3:

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 0afa51d497ff40..b3dd9fe64f942e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -491,10 +491,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8-LABEL: add_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -511,7 +509,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -536,10 +536,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: add_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -556,7 +554,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -580,10 +580,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064-LABEL: add_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_mov_b32 s4, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -599,7 +597,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -626,7 +626,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032-LABEL: add_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -641,7 +640,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -668,11 +668,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164-LABEL: add_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b32 s4, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -690,9 +687,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -718,7 +719,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132-LABEL: add_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -734,9 +734,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX1132-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -778,10 +780,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ;
 ; GFX8-LABEL: add_i32_varying_nouse:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX8-NEXT:    s_mov_b32 s2, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX8-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b32 s3, s1
@@ -795,7 +795,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX8-NEXT:    s_cbranch_execz .LBB3_4
@@ -811,10 +813,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ;
 ; GFX9-LABEL: add_i32_varying_nouse:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9-NEXT:    s_mov_b32 s2, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b32 s3, s1
@@ -828,7 +828,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB3_4
@@ -843,10 +845,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ;
 ; GFX1064-LABEL: add_i32_varying_nouse:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:    s_mov_b32 s2, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b32 s3, s1
@@ -860,7 +860,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB3_4
@@ -877,7 +879,6 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ;
 ; GFX1032-LABEL: add_i32_varying_nouse:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s0, 0
 ; GFX1032-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -890,7 +891,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX1032-NEXT:    s_cbranch_execz .LBB3_4
@@ -907,11 +909,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ;
 ; GFX1164-LABEL: add_i32_varying_nouse:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_mov_b32 s2, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_ctz_i32_b32 s3, s1
@@ -927,8 +926,11 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -944,7 +946,6 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ;
 ; GFX1132-LABEL: add_i32_varying_nouse:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s0, 0
 ; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -959,8 +960,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX1132-NEXT:    s_cbranch_execz .LBB3_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -2071,10 +2074,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8-LABEL: sub_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2091,7 +2092,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -2116,10 +2119,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: sub_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2136,7 +2137,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -2160,10 +2163,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064-LABEL: sub_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_mov_b32 s4, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2179,7 +2180,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -2206,7 +2209,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032-LABEL: sub_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -2221,7 +2223,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -2248,11 +2251,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164-LABEL: sub_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b32 s4, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB9_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2270,9 +2270,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -2298,7 +2302,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132-LABEL: sub_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -2314,9 +2317,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB9_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX1132-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -2358,10 +2363,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ;
 ; GFX8-LABEL: sub_i32_varying_nouse:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX8-NEXT:    s_mov_b32 s2, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX8-NEXT:  .LBB10_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT:    s_ff1_i32_b32 s3, s1
@@ -2375,7 +2378,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX8-NEXT:    s_cbranch_execz .LBB10_4
@@ -2391,10 +2396,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ;
 ; GFX9-LABEL: sub_i32_varying_nouse:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9-NEXT:    s_mov_b32 s2, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX9-NEXT:  .LBB10_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_ff1_i32_b32 s3, s1
@@ -2408,7 +2411,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_4
@@ -2423,10 +2428,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ;
 ; GFX1064-LABEL: sub_i32_varying_nouse:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1064-NEXT:    s_mov_b32 s2, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX1064-NEXT:  .LBB10_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1064-NEXT:    s_ff1_i32_b32 s3, s1
@@ -2440,7 +2443,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB10_4
@@ -2457,7 +2462,6 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ;
 ; GFX1032-LABEL: sub_i32_varying_nouse:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s0, 0
 ; GFX1032-NEXT:  .LBB10_1: ; %ComputeLoop
@@ -2470,7 +2474,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX1032-NEXT:    s_cbranch_execz .LBB10_4
@@ -2487,11 +2492,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ;
 ; GFX1164-LABEL: sub_i32_varying_nouse:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:    s_mov_b32 s2, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
 ; GFX1164-NEXT:  .LBB10_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1164-NEXT:    s_ctz_i32_b32 s3, s1
@@ -2507,8 +2509,11 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -2524,7 +2529,6 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ;
 ; GFX1132-LABEL: sub_i32_varying_nouse:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s0, 0
 ; GFX1132-NEXT:  .LBB10_1: ; %ComputeLoop
@@ -2539,8 +2543,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX1132-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -3206,9 +3212,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8-LABEL: and_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    s_mov_b32 s4, -1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB14_1: ; %ComputeLoop
@@ -3226,7 +3230,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -3251,9 +3257,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: and_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    s_mov_b32 s4, -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB14_1: ; %ComputeLoop
@@ -3271,7 +3275,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -3295,10 +3301,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064-LABEL: and_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_mov_b32 s4, -1
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB14_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3314,7 +3318,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -3341,7 +3347,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032-LABEL: and_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -3356,7 +3361,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -3383,11 +3389,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164-LABEL: and_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b32 s4, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB14_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3405,9 +3408,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB14_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -3433,7 +3440,6 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132-LABEL: and_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -3449,9 +3455,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB14_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX1132-NEXT:    s_cbranch_execz .LBB14_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -3499,10 +3507,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8-LABEL: or_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB15_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3519,7 +3525,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -3544,10 +3552,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: or_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB15_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3564,7 +3570,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -3588,10 +3596,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064-LABEL: or_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_mov_b32 s4, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB15_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3607,7 +3613,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -3634,7 +3642,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032-LABEL: or_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -3649,7 +3656,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -3676,11 +3684,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164-LABEL: or_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b32 s4, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB15_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3698,9 +3703,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB15_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -3726,7 +3735,6 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132-LABEL: or_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -3742,9 +3750,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB15_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX1132-NEXT:    s_cbranch_execz .LBB15_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -3792,10 +3802,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8-LABEL: xor_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB16_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3812,7 +3820,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -3837,10 +3847,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: xor_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB16_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3857,7 +3865,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -3881,10 +3891,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064-LABEL: xor_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_mov_b32 s4, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB16_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3900,7 +3908,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -3927,7 +3937,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032-LABEL: xor_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -3942,7 +3951,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -3969,11 +3979,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164-LABEL: xor_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b32 s4, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB16_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -3991,9 +3998,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -4019,7 +4030,6 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132-LABEL: xor_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -4035,9 +4045,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB16_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX1132-NEXT:    s_cbranch_execz .LBB16_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -4085,9 +4097,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8-LABEL: max_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    s_brev_b32 s4, 1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB17_1: ; %ComputeLoop
@@ -4105,7 +4115,9 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -4130,9 +4142,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: max_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    s_brev_b32 s4, 1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB17_1: ; %ComputeLoop
@@ -4150,7 +4160,9 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -4174,10 +4186,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064-LABEL: max_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_brev_b32 s4, 1
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB17_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -4193,7 +4203,9 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -4220,7 +4232,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032-LABEL: max_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_brev_b32 s2, 1
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -4235,7 +4246,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -4262,11 +4274,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164-LABEL: max_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_brev_b32 s4, 1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB17_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -4284,9 +4293,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB17_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -4312,7 +4325,6 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132-LABEL: max_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_brev_b32 s2, 1
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -4328,9 +4340,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB17_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX1132-NEXT:    s_cbranch_execz .LBB17_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -4629,9 +4643,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8-LABEL: min_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB19_1: ; %ComputeLoop
@@ -4649,7 +4661,9 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -4674,9 +4688,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: min_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    s_brev_b32 s4, -2
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB19_1: ; %ComputeLoop
@@ -4694,7 +4706,9 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -4718,10 +4732,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064-LABEL: min_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_brev_b32 s4, -2
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB19_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -4737,7 +4749,9 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -4764,7 +4778,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032-LABEL: min_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_brev_b32 s2, -2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -4779,7 +4792,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -4806,11 +4820,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164-LABEL: min_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_brev_b32 s4, -2
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB19_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -4828,9 +4839,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB19_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -4856,7 +4871,6 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132-LABEL: min_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_brev_b32 s2, -2
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -4872,9 +4886,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB19_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX1132-NEXT:    s_cbranch_execz .LBB19_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -5173,10 +5189,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8-LABEL: umax_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB21_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5193,7 +5207,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -5218,10 +5234,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: umax_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB21_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5238,7 +5252,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -5262,10 +5278,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064-LABEL: umax_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_mov_b32 s4, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB21_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5281,7 +5295,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -5308,7 +5324,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032-LABEL: umax_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -5323,7 +5338,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -5350,11 +5366,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164-LABEL: umax_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b32 s4, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB21_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5372,9 +5385,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB21_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -5400,7 +5417,6 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132-LABEL: umax_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -5416,9 +5432,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB21_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX1132-NEXT:    s_cbranch_execz .LBB21_4
 ; GFX1132-NEXT:  ; %bb.3:
@@ -5712,9 +5730,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX8-LABEL: umin_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    s_mov_b32 s4, -1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB23_1: ; %ComputeLoop
@@ -5732,7 +5748,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -5757,9 +5775,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX9-LABEL: umin_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    s_mov_b32 s4, -1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB23_1: ; %ComputeLoop
@@ -5777,7 +5793,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -5801,10 +5819,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1064-LABEL: umin_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    s_mov_b32 s4, -1
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:  .LBB23_1: ; %ComputeLoop
 ; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5820,7 +5836,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1064-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -5847,7 +5865,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1032-LABEL: umin_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
@@ -5862,7 +5879,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1032-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -5889,11 +5907,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1164-LABEL: umin_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    s_mov_b32 s4, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:  .LBB23_1: ; %ComputeLoop
 ; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -5911,9 +5926,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX1164-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX1164-NEXT:    s_cbranch_execz .LBB23_4
 ; GFX1164-NEXT:  ; %bb.3:
@@ -5939,7 +5958,6 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ;
 ; GFX1132-LABEL: umin_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
@@ -5955,9 +5973,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX1132-NEXT:    s_cbranch_scc1 .LBB23_1
 ; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX1132-NEXT:    s_cbranch_execz .LBB23_4
 ; GFX1132-NEXT:  ; %bb.3:

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index e0b664713e86e2..22c3961fa7bd71 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -448,10 +448,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX8-LABEL: add_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -468,7 +466,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -492,10 +492,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: add_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -512,7 +510,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -535,10 +535,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W64-LABEL: add_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX10W64-NEXT:    s_mov_b32 s4, 0
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX10W64-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -554,7 +552,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -578,7 +578,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W32-LABEL: add_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX10W32-NEXT:    s_mov_b32 s2, 0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr1
@@ -593,7 +592,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -617,11 +617,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W64-LABEL: add_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11W64-NEXT:    s_mov_b32 s4, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX11W64-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -639,9 +636,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr0
-; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX11W64-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX11W64-NEXT:  ; %bb.3:
@@ -664,7 +665,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11W32-NEXT:    s_mov_b32 s2, 0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr1
@@ -680,9 +680,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr0
-; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX11W32-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX11W32-NEXT:  ; %bb.3:
@@ -1224,10 +1226,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX8-LABEL: sub_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB6_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1244,7 +1244,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -1268,10 +1270,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: sub_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB6_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1288,7 +1288,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -1311,10 +1313,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W64-LABEL: sub_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX10W64-NEXT:    s_mov_b32 s4, 0
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX10W64-NEXT:  .LBB6_1: ; %ComputeLoop
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1330,7 +1330,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -1354,7 +1356,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W32-LABEL: sub_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX10W32-NEXT:    s_mov_b32 s2, 0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr1
@@ -1369,7 +1370,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -1393,11 +1395,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W64-LABEL: sub_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11W64-NEXT:    s_mov_b32 s4, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX11W64-NEXT:  .LBB6_1: ; %ComputeLoop
 ; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1415,9 +1414,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr0
-; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX11W64-NEXT:    s_cbranch_execz .LBB6_4
 ; GFX11W64-NEXT:  ; %bb.3:
@@ -1440,7 +1443,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11W32-NEXT:    s_mov_b32 s2, 0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr1
@@ -1456,9 +1458,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr0
-; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX11W32-NEXT:    s_cbranch_execz .LBB6_4
 ; GFX11W32-NEXT:  ; %bb.3:

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index be0132e335e892..cf1b34454eb282 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -463,10 +463,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX8-LABEL: add_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -483,7 +481,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -508,10 +508,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: add_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -528,7 +526,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -552,10 +552,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W64-LABEL: add_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX10W64-NEXT:    s_mov_b32 s4, 0
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX10W64-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -571,7 +569,9 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -596,7 +596,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W32-LABEL: add_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX10W32-NEXT:    s_mov_b32 s2, 0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr1
@@ -611,7 +610,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -636,11 +636,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W64-LABEL: add_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11W64-NEXT:    s_mov_b32 s4, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX11W64-NEXT:  .LBB2_1: ; %ComputeLoop
 ; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -658,9 +655,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr0
-; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX11W64-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX11W64-NEXT:  ; %bb.3:
@@ -684,7 +685,6 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11W32-NEXT:    s_mov_b32 s2, 0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr1
@@ -700,9 +700,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr0
-; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX11W32-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX11W32-NEXT:  ; %bb.3:
@@ -1362,10 +1364,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX8-LABEL: sub_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    s_mov_b32 s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1382,7 +1382,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -1407,10 +1409,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX9-LABEL: sub_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1427,7 +1427,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -1451,10 +1453,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W64-LABEL: sub_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX10W64-NEXT:    s_mov_b32 s4, 0
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX10W64-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1470,7 +1470,9 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX10W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
@@ -1495,7 +1497,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX10W32-LABEL: sub_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX10W32-NEXT:    s_mov_b32 s2, 0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr1
@@ -1510,7 +1511,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10W32-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
@@ -1535,11 +1537,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W64-LABEL: sub_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11W64-NEXT:    s_mov_b32 s4, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr1
 ; GFX11W64-NEXT:  .LBB7_1: ; %ComputeLoop
 ; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1557,9 +1556,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX11W64-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX11W64-NEXT:    ; implicit-def: $vgpr0
-; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX11W64-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX11W64-NEXT:  ; %bb.3:
@@ -1583,7 +1586,6 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
 ; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11W32-NEXT:    s_mov_b32 s2, 0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr1
@@ -1599,9 +1601,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
 ; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
 ; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
-; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11W32-NEXT:    ; implicit-def: $vgpr0
-; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
 ; GFX11W32-NEXT:    s_cbranch_execz .LBB7_4
 ; GFX11W32-NEXT:  ; %bb.3:


        


More information about the llvm-commits mailing list