[llvm] a46dba2 - [AMDGPU] Extend macro fusion for ADDC and SUBB to SUBBREV

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 11 10:59:33 PDT 2020


Author: Jay Foad
Date: 2020-03-11T17:59:21Z
New Revision: a46dba24fa35ab52e9a1bbaa52666bcc37859927

URL: https://github.com/llvm/llvm-project/commit/a46dba24fa35ab52e9a1bbaa52666bcc37859927
DIFF: https://github.com/llvm/llvm-project/commit/a46dba24fa35ab52e9a1bbaa52666bcc37859927.diff

LOG: [AMDGPU] Extend macro fusion for ADDC and SUBB to SUBBREV

Summary:
There is a lot of test-case churn, but the overall effect is to increase
the number of back-to-back v_sub/v_subbrev pairs, which can execute with
no delay even on gfx10.
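
For context, a minimal sketch of the fusion predicate after this change,
reconstructed from the AMDGPUMacroFusion.cpp hunk below. The signature is
assumed to match the in-tree macro-fusion callback, and the body that
actually matches the VCC def against its use in SecondMI is elided behind
a hypothetical helper (clusterCondRegDefUse) that is not a real function
in the file:

  static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
                                     const TargetSubtargetInfo &STI,
                                     const MachineInstr *FirstMI,
                                     const MachineInstr &SecondMI) {
    switch (SecondMI.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64: // new: also fuse the reversed borrow
    case AMDGPU::V_CNDMASK_B32_e64:
      // Cluster the def of the condition register next to its use so VCC
      // is more likely to be available, allowing shrinking to VOP2.
      return clusterCondRegDefUse(FirstMI, SecondMI); // hypothetical helper
    default:
      return false;
    }
  }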

Reviewers: arsenm, rampitec, nhaehnle

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D75999

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/bypass-div.ll
    llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/wave32.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 8c11230f411a..b05855d1afc6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -34,6 +34,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
   switch (SecondMI.getOpcode()) {
   case AMDGPU::V_ADDC_U32_e64:
   case AMDGPU::V_SUBB_U32_e64:
+  case AMDGPU::V_SUBBREV_U32_e64:
   case AMDGPU::V_CNDMASK_B32_e64: {
     // Try to cluster defs of condition registers to their uses. This improves
     // the chance VCC will be available which will allow shrinking to VOP2

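As a usage note, this predicate is the callback behind the AMDGPU
macro-fusion DAG mutation; a sketch of the wiring (assumed API shape at
the time of this commit, not part of this diff):

  #include "llvm/CodeGen/MacroFusion.h"

  std::unique_ptr<ScheduleDAGMutation>
  llvm::createAMDGPUMacroFusionDAGMutation() {
    // Wrap the target predicate in the generic fusion mutation so the
    // machine scheduler keeps matching def/use pairs back to back.
    return createMacroFusionDAGMutation(shouldScheduleAdjacent);
  }
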
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index b835365ec0c6..cdfbf5043672 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -5741,7 +5741,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
@@ -5752,7 +5752,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
-; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_movk_i32 s5, 0x11e
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
@@ -5760,7 +5760,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GCN-NEXT:    s_mov_b32 s9, s5
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s10, -1
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
@@ -5796,35 +5797,34 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
-; GCN-NEXT:    v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1]
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
+; GCN-NEXT:    s_mov_b32 s6, 0x9761f7c8
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    s_movk_i32 s2, 0x11e
-; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v2
-; GCN-NEXT:    s_mov_b32 s3, 0x9761f7c8
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s12, v4
-; GCN-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v0
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
@@ -7018,29 +7018,29 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
-; GCN-NEXT:    v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1]
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s12, v4
-; GCN-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s14, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s14, v1
@@ -7225,39 +7225,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
-; GCN-NEXT:    v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1]
-; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s16, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s17, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v8, s[0:1], s16, v5
+; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
 ; GCN-NEXT:    s_ashr_i32 s2, s15, 31
-; GCN-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
 ; GCN-NEXT:    s_add_u32 s8, s14, s2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v7, s9
 ; GCN-NEXT:    s_mov_b32 s3, s2
 ; GCN-NEXT:    s_addc_u32 s9, s15, s2
 ; GCN-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
-; GCN-NEXT:    v_cvt_f32_u32_e32 v7, s8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v8, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v9, s9
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_mac_f32_e32 v7, s18, v9
-; GCN-NEXT:    v_rcp_f32_e32 v7, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GCN-NEXT:    v_mac_f32_e32 v8, s18, v9
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
+; GCN-NEXT:    v_rcp_f32_e32 v8, v8
 ; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GCN-NEXT:    v_mul_f32_e32 v3, s19, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v8, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
+; GCN-NEXT:    v_mul_f32_e32 v3, s19, v8
 ; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
 ; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
@@ -7347,29 +7347,29 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
 ; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
-; GCN-NEXT:    v_subb_u32_e64 v5, s[2:3], v4, v5, s[0:1]
+; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
+; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v8, s[0:1], s8, v6
-; GCN-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v7, s11
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v8, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v2, s14, v2
 ; GCN-NEXT:    v_xor_b32_e32 v3, s14, v3

diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 9ac1570fab7c..299ae9083703 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -434,20 +434,20 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v8, v4
 ; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v9, v5, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v4, v3
-; GFX9-NEXT:    v_subb_co_u32_e64 v10, s[6:7], v8, v5, s[4:5]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v5
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v10, v5
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v8, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[6:7]
 ; GFX9-NEXT:    v_sub_co_u32_e64 v12, s[4:5], v9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[4:5], 0, v10, s[4:5]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
@@ -455,7 +455,7 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v12, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v10, v8, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
@@ -592,22 +592,22 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v4
 ; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v6, v3, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e64 v7, s[4:5], v4, v2
-; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[6:7], v6, v3, s[4:5]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[6:7], 0, v6, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v8, v3
+; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v6, v3, s[4:5]
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[6:7]
 ; GFX9-NEXT:    v_sub_co_u32_e64 v10, s[4:5], v7, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
@@ -809,7 +809,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; GFX9-NEXT:    ; implicit-def: $vgpr3_vgpr4
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz BB8_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v7
@@ -903,48 +903,48 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, v9, v13
 ; GFX9-NEXT:    v_subb_co_u32_e64 v11, s[4:5], v11, v5, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e64 v12, s[4:5], v9, v6
-; GFX9-NEXT:    v_subb_co_u32_e64 v13, s[6:7], v11, v5, s[4:5]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v11, s[4:5], 0, v11, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[4:5]
-; GFX9-NEXT:    v_add_co_u32_e64 v15, s[4:5], 2, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[4:5], 0, v7, s[4:5]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v13, s[6:7], 0, v11, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v13, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v13, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
+; GFX9-NEXT:    v_add_co_u32_e64 v15, s[6:7], 2, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[6:7], 0, v7, s[6:7]
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v17, s[4:5], 1, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v18, s[4:5], 0, v7, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v17, s[6:7], 1, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v18, s[6:7], 0, v7, s[6:7]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v14
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v14
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v16, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v14, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v17, v15, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v7, v8, v3
-; GFX9-NEXT:    v_xor_b32_e32 v3, v4, v7
-; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v7
-; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[6:7], v3, v7
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[6:7], v5, v7, s[6:7]
-; GFX9-NEXT:    v_sub_co_u32_e64 v5, s[6:7], v12, v6
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[6:7], 0, v13, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v12, v5, s[4:5]
+; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[4:5], v11, v5, s[4:5]
+; GFX9-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v12, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v16, vcc
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[4:5], 0, v5, s[4:5]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v12, v6, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v17, v15, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v11, v6, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v10, v8, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v14, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v3, v4, v10
 ; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v8
+; GFX9-NEXT:    v_xor_b32_e32 v7, v7, v10
+; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[8:9], v3, v10
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v8
 ; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, v5, v8
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[8:9], v7, v10, s[8:9]
 ; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v1, v8, vcc
 ; GFX9-NEXT:  BB8_2: ; %Flow
-; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], s[8:9]
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], s[10:11]
 ; GFX9-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execz BB8_4
 ; GFX9-NEXT:  ; %bb.3:
@@ -1085,35 +1085,35 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v0, v9
 ; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[4:5], v7, v3, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v8, v2
-; GFX9-NEXT:    v_subb_co_u32_e64 v10, s[6:7], v7, v3, s[4:5]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
-; GFX9-NEXT:    v_add_co_u32_e64 v12, s[4:5], 2, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[4:5], 0, v5, s[4:5]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[6:7], 0, v7, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v10, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[6:7]
+; GFX9-NEXT:    v_add_co_u32_e64 v12, s[6:7], 2, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[6:7], 0, v5, s[6:7]
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v14, s[4:5], 1, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[4:5], 0, v5, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v14, s[6:7], 1, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[6:7], 0, v5, s[6:7]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v15, v13, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v15, v13, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v13, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v14, v12, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc
-; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[6:7], v9, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[6:7], 0, v10, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v13, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v14, v12, s[6:7]
+; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[4:5], v7, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v9, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v10, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v6, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v1, vcc
 ; GFX9-NEXT:  BB9_2: ; %Flow

diff --git a/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir b/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
index 42b5f1957302..a394d344cdd6 100644
--- a/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
+++ b/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
@@ -147,6 +147,32 @@ body: |
     %4, %5 = V_SUBB_U32_e64 %6, %7, %3, 0, implicit $exec
 ...
 
+# GCN-LABEL: name: cluster_subrev_subbrev
+# GCN: S_NOP 0, implicit-def $vcc
+# GCN: dead %2:vgpr_32, %3:sreg_64_xexec = V_SUBREV_I32_e64 %0, %1, 0, implicit $exec
+# GCN: dead %4:vgpr_32, dead %5:sreg_64_xexec = V_SUBBREV_U32_e64 %6, %7, %3, 0, implicit $exec
+name: cluster_subrev_subbrev
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_64_xexec }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit $exec
+    %1 = V_MOV_B32_e32 0, implicit $exec
+    %2, %3 = V_SUBREV_I32_e64 %0, %1, 0, implicit $exec
+    %6 = V_MOV_B32_e32 0, implicit $exec
+    %7 = V_MOV_B32_e32 0, implicit $exec
+    S_NOP 0, implicit def $vcc
+    %4, %5 = V_SUBBREV_U32_e64 %6, %7, %3, 0, implicit $exec
+...
+
 # GCN-LABEL: name: cluster_cmp_cndmask
 # GCN: S_NOP 0, implicit-def $vcc
 # GCN-NEXT: %3:sreg_64_xexec = V_CMP_EQ_I32_e64 %0, %1, implicit $exec

diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index b31c4d00fb9f..96145858a303 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -97,29 +97,29 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
-; GCN-NEXT:    v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1]
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s12, v4
-; GCN-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -331,20 +331,20 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v7, v3, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v7, s[4:5], v0, v2
-; GCN-NEXT:    v_subb_u32_e64 v8, s[6:7], v4, v3, s[4:5]
-; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
+; GCN-NEXT:    v_subbrev_u32_e64 v8, s[6:7], 0, v4, s[4:5]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v3
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v8, v3
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
+; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[6:7]
 ; GCN-NEXT:    v_sub_i32_e64 v10, s[4:5], v7, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GCN-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
@@ -352,7 +352,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v10, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v6
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
@@ -987,29 +987,29 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s14, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
-; GCN-NEXT:    v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1]
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s12, v4
-; GCN-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v5, s15
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s10, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s10, v1
@@ -1410,28 +1410,28 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s8, v0
-; GCN-NEXT:    v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1]
-; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v4
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s8, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v5
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -1623,27 +1623,27 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 24, v2
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v5, s[4:5], v2, v0
-; GCN-NEXT:    v_subb_u32_e64 v6, s[6:7], v4, v1, s[4:5]
-; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v5, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v1
+; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
 ; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
+; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
-; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GCN-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v6, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[4:5]
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v6, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -1834,27 +1834,27 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v5, s[4:5], v2, v0
-; GCN-NEXT:    v_subb_u32_e64 v6, s[6:7], v4, v1, s[4:5]
-; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v5, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v1
+; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
 ; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
+; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
-; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GCN-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v6, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[4:5]
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v6, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
index 911677478d51..cee3dc60f0c8 100644
--- a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
@@ -4,9 +4,9 @@
 ;
 ; GCN-LABEL: sub_zext_zext:
 ; GCN: ds_read_b32 [[VAL:v[0-9]+]],
-; GCN-DAG: v_cmp_lt_f32{{.*}} [[CC1:s\[[0-9]+:[0-9]+\]]], 0, [[VAL]]
-; GCN-DAG: v_cmp_gt_f32{{.*}} vcc, 0, [[VAL]]
-; GCN: v_cndmask_{{.*}} [[ZEXTCC1:v[0-9]+]], 0, 1, [[CC1]]
+; GCN: v_cmp_lt_f32{{.*}} vcc, 0, [[VAL]]
+; GCN: v_cndmask_{{.*}} [[ZEXTCC1:v[0-9]+]], 0, 1, vcc
+; GCN: v_cmp_gt_f32{{.*}} vcc, 0, [[VAL]]
 ; GCN: v_subbrev{{.*}} {{v[0-9]+}}, vcc, 0, [[ZEXTCC1]], vcc
 ;
 ; Before the reversion that this test is attached to, the compiler commuted

diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 8925912f71c7..86b4a39057c3 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -1585,18 +1585,18 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, 24, v8
 ; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v8, 24, v8
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
-; GCN-IR-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
-; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[0:1], 0, v5, s[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
-; GCN-IR-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v7, s[0:1], 0, v7, s[0:1]
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB11_5
 ; GCN-IR-NEXT:  BB11_6: ; %udiv-loop-exit
@@ -1757,25 +1757,25 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v6, v7, v4
+; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 23, v6
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v9, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    v_add_i32_e64 v9, s[4:5], 1, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v9, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v7, 31, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v3, v10, v3
-; GCN-IR-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, v1, s[4:5]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[4:5], v[9:10], v[0:1]
+; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1]
 ; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v7
 ; GCN-IR-NEXT:    v_and_b32_e32 v7, 24, v7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v9
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v6, v7
+; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], v6, v7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, v5
-; GCN-IR-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v4
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB12_3

diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 9a9703c3803c..9df153381d83 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -97,29 +97,29 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
-; GCN-NEXT:    v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1]
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s12, v4
-; GCN-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -321,27 +321,27 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v6, v3, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
-; GCN-NEXT:    v_subb_u32_e64 v7, s[6:7], v4, v3, s[4:5]
-; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
+; GCN-NEXT:    v_subbrev_u32_e64 v7, s[6:7], 0, v4, s[4:5]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v7, v3
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v3
+; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[6:7]
+; GCN-NEXT:    v_sub_i32_e64 v9, s[4:5], v6, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[4:5]
-; GCN-NEXT:    v_sub_i32_e64 v9, s[4:5], v6, v2
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v8
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_subbrev_u32_e64 v2, s[4:5], 0, v7, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[6:7]
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -832,28 +832,28 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
 ; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s6, v0
-; GCN-NEXT:    v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1]
-; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s6, v4
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s7, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s6, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s6, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s7, v5
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
@@ -1115,18 +1115,18 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, 24, v8
 ; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v8, 24, v8
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
-; GCN-IR-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
-; GCN-IR-NEXT:    v_addc_u32_e64 v9, s[0:1], 0, v5, s[0:1]
-; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
-; GCN-IR-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v7, s[0:1], 0, v7, s[0:1]
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB7_5
 ; GCN-IR-NEXT:  BB7_6: ; %udiv-loop-exit
@@ -1232,27 +1232,27 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v5, s[4:5], v2, v0
-; GCN-NEXT:    v_subb_u32_e64 v6, s[6:7], v4, v1, s[4:5]
-; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v5, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], v6, v1
+; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5]
 ; GCN-NEXT:    v_subb_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], v4, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[6:7]
+; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5]
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
-; GCN-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; GCN-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v6, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[4:5]
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v6, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index aae7a22addc4..e3149be899c0 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -336,8 +336,8 @@ bb:
 ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
 ; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX1032: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
-; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
+; GFX1032: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
+; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
 ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s\[[0-9:]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
 ; GFX1064: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]]
@@ -346,8 +346,8 @@ bb:
 ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
 ; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX1064: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}}
-; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}}
+; GFX1064: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}}
+; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}}
 define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 {
 bb:
   %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1
