[llvm] 1416744 - GlobalISel: Implement computeKnownBits for overflow bool results

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 11 16:44:46 PDT 2022


Author: Matt Arsenault
Date: 2022-04-11T19:43:37-04:00
New Revision: 1416744f8405db03096bc240a8ec9de176a71569

URL: https://github.com/llvm/llvm-project/commit/1416744f8405db03096bc240a8ec9de176a71569
DIFF: https://github.com/llvm/llvm-project/commit/1416744f8405db03096bc240a8ec9de176a71569.diff

LOG: GlobalISel: Implement computeKnownBits for overflow bool results

Added: 
    

Modified: 
    llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
    llvm/test/CodeGen/AMDGPU/bfi_int.ll
    llvm/test/CodeGen/AMDGPU/constrained-shift.ll
    llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
    llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 64c2f0d5f8e49..4f03af0fce82d 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -567,6 +567,26 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
     Known = KnownBits::ashr(KnownBits::shl(Known, ShiftKnown), ShiftKnown);
     break;
   }
+  case TargetOpcode::G_UADDO:
+  case TargetOpcode::G_UADDE:
+  case TargetOpcode::G_SADDO:
+  case TargetOpcode::G_SADDE:
+  case TargetOpcode::G_USUBO:
+  case TargetOpcode::G_USUBE:
+  case TargetOpcode::G_SSUBO:
+  case TargetOpcode::G_SSUBE:
+  case TargetOpcode::G_UMULO:
+  case TargetOpcode::G_SMULO: {
+    if (MI.getOperand(1).getReg() == R) {
+      // If we know the result of a compare has the top bits zero, use this
+      // info.
+      if (TL.getBooleanContents(DstTy.isVector(), false) ==
+              TargetLowering::ZeroOrOneBooleanContent &&
+          BitWidth > 1)
+        Known.Zero.setBitsFrom(1);
+    }
+    break;
+  }
   }
 
   assert(!Known.hasConflict() && "Bits known to be one AND zero?");
@@ -673,6 +693,27 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
                                  MI.getOperand(3).getReg(), DemandedElts,
                                  Depth + 1);
   }
+  case TargetOpcode::G_SADDO:
+  case TargetOpcode::G_SADDE:
+  case TargetOpcode::G_UADDO:
+  case TargetOpcode::G_UADDE:
+  case TargetOpcode::G_SSUBO:
+  case TargetOpcode::G_SSUBE:
+  case TargetOpcode::G_USUBO:
+  case TargetOpcode::G_USUBE:
+  case TargetOpcode::G_SMULO:
+  case TargetOpcode::G_UMULO: {
+    // If compares return 0/-1, all bits are sign bits.
+    // We know that we have an integer-based boolean since these operations
+    // are only available for integers.
+    if (MI.getOperand(1).getReg() == R) {
+      if (TL.getBooleanContents(DstTy.isVector(), false) ==
+          TargetLowering::ZeroOrNegativeOneBooleanContent)
+        return TyBits;
+    }
+
+    break;
+  }
   case TargetOpcode::G_INTRINSIC:
   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
   default: {

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index 352b811b7845b..646705337aabc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -457,7 +457,6 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_add_u32 s0, s0, s1
 ; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX7-NEXT:    s_and_b32 s1, s1, 1
 ; GFX7-NEXT:    s_add_i32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -465,7 +464,6 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -473,7 +471,6 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    s_add_i32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
   %uaddo = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
@@ -488,9 +485,6 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-LABEL: s_uaddo_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_add_u32 s0, s0, s2
-; GFX7-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX7-NEXT:    s_and_b32 s4, s4, 1
-; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
@@ -506,9 +500,6 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX8-LABEL: s_uaddo_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
-; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX8-NEXT:    s_and_b32 s4, s4, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -524,9 +515,6 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX9-LABEL: s_uaddo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
-; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX9-NEXT:    s_and_b32 s4, s4, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -553,8 +541,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX7-NEXT:    s_add_u32 s1, s1, s3
 ; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX7-NEXT:    s_and_b32 s2, s2, 1
-; GFX7-NEXT:    s_and_b32 s3, s3, 1
 ; GFX7-NEXT:    s_add_i32 s0, s0, s2
 ; GFX7-NEXT:    s_add_i32 s1, s1, s3
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -565,8 +551,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_add_u32 s1, s1, s3
 ; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    s_add_i32 s0, s0, s2
 ; GFX8-NEXT:    s_add_i32 s1, s1, s3
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -577,8 +561,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_add_u32 s1, s1, s3
 ; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    s_add_i32 s0, s0, s2
 ; GFX9-NEXT:    s_add_i32 s1, s1, s3
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -728,9 +710,6 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-LABEL: s_saddo_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_add_u32 s4, s0, s2
-; GFX7-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX7-NEXT:    s_and_b32 s5, s5, 1
-; GFX7-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
@@ -748,9 +727,6 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX8-LABEL: s_saddo_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s4, s0, s2
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX8-NEXT:    s_and_b32 s5, s5, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -768,9 +744,6 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX9-LABEL: s_saddo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s4, s0, s2
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX9-NEXT:    s_and_b32 s5, s5, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index cca8a9ee86fde..f07d2b83dbf31 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -31,9 +31,6 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
 ; GFX:       ; %bb.0:
 ; GFX-NEXT:    s_ashr_i32 s2, s1, 31
 ; GFX-NEXT:    s_add_u32 s0, s0, s2
-; GFX-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX-NEXT:    s_and_b32 s4, s4, 1
-; GFX-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX-NEXT:    s_mov_b32 s3, s2
 ; GFX-NEXT:    s_addc_u32 s1, s1, s2
 ; GFX-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 97858b3dae67c..ccf6e6be39be3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -447,7 +447,6 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s5, s0, s5
 ; GFX7-NEXT:    s_add_i32 s0, s2, s7
 ; GFX7-NEXT:    s_add_i32 s0, s0, s5
-; GFX7-NEXT:    s_and_b32 s8, s8, 1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s8, v1
@@ -477,7 +476,6 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s5, s0, s5
 ; GFX8-NEXT:    s_add_i32 s0, s2, s7
 ; GFX8-NEXT:    s_add_i32 s0, s0, s5
-; GFX8-NEXT:    s_and_b32 s8, s8, 1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s8, v1
@@ -492,13 +490,11 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mul_i32 s7, s1, s3
 ; GFX9-NEXT:    s_mul_i32 s8, s0, s4
+; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s3
 ; GFX9-NEXT:    s_add_u32 s7, s7, s8
 ; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s3
-; GFX9-NEXT:    s_and_b32 s8, s8, 1
 ; GFX9-NEXT:    s_add_u32 s7, s7, s9
 ; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_and_b32 s9, s9, 1
 ; GFX9-NEXT:    s_add_i32 s8, s8, s9
 ; GFX9-NEXT:    s_mul_i32 s2, s2, s3
 ; GFX9-NEXT:    s_mul_i32 s9, s1, s4
@@ -521,17 +517,15 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ; GFX10-NEXT:    s_mul_i32 s7, s0, s4
 ; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s3
 ; GFX10-NEXT:    s_add_u32 s6, s6, s7
-; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX10-NEXT:    s_mul_i32 s2, s2, s3
-; GFX10-NEXT:    s_and_b32 s7, s7, 1
 ; GFX10-NEXT:    s_mul_i32 s9, s1, s4
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX10-NEXT:    s_add_u32 s6, s6, s8
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_mul_i32 s5, s0, s5
 ; GFX10-NEXT:    s_add_i32 s2, s2, s9
 ; GFX10-NEXT:    s_mul_hi_u32 s1, s1, s3
 ; GFX10-NEXT:    s_add_i32 s2, s2, s5
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
 ; GFX10-NEXT:    s_mul_hi_u32 s4, s0, s4
 ; GFX10-NEXT:    s_add_i32 s1, s2, s1
 ; GFX10-NEXT:    s_add_i32 s7, s7, s8
@@ -656,24 +650,21 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s9, s1, s4
 ; GFX7-NEXT:    s_mul_i32 s10, s0, s5
 ; GFX7-NEXT:    s_add_u32 s9, s9, s10
-; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s9, v0
-; GFX7-NEXT:    s_and_b32 s10, s10, 1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s4
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s10, v1
 ; GFX7-NEXT:    s_mul_i32 s9, s2, s4
 ; GFX7-NEXT:    s_mul_i32 s10, s1, s5
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
-; GFX7-NEXT:    s_add_u32 s9, s9, s10
-; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s11, s0, s6
-; GFX7-NEXT:    s_and_b32 s10, s10, 1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    s_add_u32 s9, s9, s11
+; GFX7-NEXT:    s_add_u32 s9, s9, s10
 ; GFX7-NEXT:    v_mul_hi_u32 v4, s0, v3
+; GFX7-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX7-NEXT:    s_add_u32 s9, s9, s11
 ; GFX7-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX7-NEXT:    s_and_b32 s11, s11, 1
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s9, v2
 ; GFX7-NEXT:    s_add_i32 s10, s10, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -714,24 +705,21 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s9, s1, s4
 ; GFX8-NEXT:    s_mul_i32 s10, s0, s5
 ; GFX8-NEXT:    s_add_u32 s9, s9, s10
-; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s9, v0
-; GFX8-NEXT:    s_and_b32 s10, s10, 1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s4
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s10, v1
 ; GFX8-NEXT:    s_mul_i32 s9, s2, s4
 ; GFX8-NEXT:    s_mul_i32 s10, s1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    s_add_u32 s9, s9, s10
-; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s11, s0, s6
-; GFX8-NEXT:    s_and_b32 s10, s10, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    s_add_u32 s9, s9, s11
+; GFX8-NEXT:    s_add_u32 s9, s9, s10
 ; GFX8-NEXT:    v_mul_hi_u32 v4, s0, v3
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX8-NEXT:    s_add_u32 s9, s9, s11
 ; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX8-NEXT:    s_and_b32 s11, s11, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s9, v2
 ; GFX8-NEXT:    s_add_i32 s10, s10, s11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -769,37 +757,30 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mul_i32 s9, s1, s4
 ; GFX9-NEXT:    s_mul_i32 s10, s0, s5
+; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s4
 ; GFX9-NEXT:    s_add_u32 s9, s9, s10
 ; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s4
-; GFX9-NEXT:    s_and_b32 s10, s10, 1
 ; GFX9-NEXT:    s_add_u32 s9, s9, s11
 ; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX9-NEXT:    s_and_b32 s11, s11, 1
 ; GFX9-NEXT:    s_add_i32 s10, s10, s11
 ; GFX9-NEXT:    s_mul_i32 s11, s2, s4
 ; GFX9-NEXT:    s_mul_i32 s12, s1, s5
+; GFX9-NEXT:    s_mul_i32 s13, s0, s6
 ; GFX9-NEXT:    s_add_u32 s11, s11, s12
 ; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX9-NEXT:    s_mul_i32 s13, s0, s6
-; GFX9-NEXT:    s_and_b32 s12, s12, 1
 ; GFX9-NEXT:    s_add_u32 s11, s11, s13
 ; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX9-NEXT:    s_and_b32 s13, s13, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s14, s1, s4
 ; GFX9-NEXT:    s_add_i32 s12, s12, s13
 ; GFX9-NEXT:    s_add_u32 s11, s11, s14
 ; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX9-NEXT:    s_and_b32 s13, s13, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s15, s0, s5
 ; GFX9-NEXT:    s_add_i32 s12, s12, s13
 ; GFX9-NEXT:    s_add_u32 s11, s11, s15
 ; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX9-NEXT:    s_and_b32 s13, s13, 1
 ; GFX9-NEXT:    s_add_i32 s12, s12, s13
 ; GFX9-NEXT:    s_add_u32 s10, s11, s10
 ; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX9-NEXT:    s_and_b32 s11, s11, 1
 ; GFX9-NEXT:    s_add_i32 s12, s12, s11
 ; GFX9-NEXT:    s_mul_i32 s3, s3, s4
 ; GFX9-NEXT:    s_mul_i32 s11, s2, s5
@@ -828,52 +809,45 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX10-NEXT:    s_mul_hi_u32 s10, s0, s4
 ; GFX10-NEXT:    s_add_u32 s8, s8, s9
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    s_mul_i32 s11, s1, s5
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
 ; GFX10-NEXT:    s_add_u32 s8, s8, s10
 ; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX10-NEXT:    s_mul_i32 s12, s0, s6
-; GFX10-NEXT:    s_and_b32 s10, s10, 1
-; GFX10-NEXT:    s_mul_hi_u32 s13, s1, s4
+; GFX10-NEXT:    s_mul_i32 s11, s1, s5
 ; GFX10-NEXT:    s_add_i32 s9, s9, s10
 ; GFX10-NEXT:    s_mul_i32 s10, s2, s4
-; GFX10-NEXT:    s_mul_i32 s3, s3, s4
+; GFX10-NEXT:    s_mul_i32 s12, s0, s6
 ; GFX10-NEXT:    s_add_u32 s10, s10, s11
 ; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX10-NEXT:    s_mul_i32 s7, s0, s7
-; GFX10-NEXT:    s_and_b32 s11, s11, 1
 ; GFX10-NEXT:    s_add_u32 s10, s10, s12
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_and_b32 s12, s12, 1
+; GFX10-NEXT:    s_mul_hi_u32 s13, s1, s4
 ; GFX10-NEXT:    s_add_i32 s11, s11, s12
 ; GFX10-NEXT:    s_add_u32 s10, s10, s13
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s13, s0, s5
-; GFX10-NEXT:    s_and_b32 s12, s12, 1
+; GFX10-NEXT:    s_mul_hi_u32 s14, s0, s5
 ; GFX10-NEXT:    s_add_i32 s11, s11, s12
-; GFX10-NEXT:    s_add_u32 s10, s10, s13
+; GFX10-NEXT:    s_add_u32 s10, s10, s14
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_mul_i32 s13, s1, s6
-; GFX10-NEXT:    s_and_b32 s12, s12, 1
-; GFX10-NEXT:    s_mul_hi_u32 s1, s1, s5
+; GFX10-NEXT:    s_mul_i32 s3, s3, s4
 ; GFX10-NEXT:    s_add_i32 s11, s11, s12
 ; GFX10-NEXT:    s_mul_i32 s12, s2, s5
 ; GFX10-NEXT:    s_add_u32 s9, s10, s9
 ; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    s_mul_i32 s13, s1, s6
 ; GFX10-NEXT:    s_add_i32 s3, s3, s12
-; GFX10-NEXT:    s_mul_hi_u32 s2, s2, s4
+; GFX10-NEXT:    s_mul_i32 s7, s0, s7
 ; GFX10-NEXT:    s_add_i32 s3, s3, s13
-; GFX10-NEXT:    s_and_b32 s10, s10, 1
+; GFX10-NEXT:    s_mul_hi_u32 s2, s2, s4
 ; GFX10-NEXT:    s_add_i32 s3, s3, s7
-; GFX10-NEXT:    s_add_i32 s11, s11, s10
+; GFX10-NEXT:    s_mul_hi_u32 s1, s1, s5
 ; GFX10-NEXT:    s_add_i32 s2, s3, s2
 ; GFX10-NEXT:    s_mul_hi_u32 s3, s0, s6
 ; GFX10-NEXT:    s_add_i32 s1, s2, s1
-; GFX10-NEXT:    s_mul_i32 s0, s0, s4
+; GFX10-NEXT:    s_add_i32 s11, s11, s10
 ; GFX10-NEXT:    s_add_i32 s1, s1, s3
-; GFX10-NEXT:    s_mov_b32 s2, s9
+; GFX10-NEXT:    s_mul_i32 s0, s0, s4
 ; GFX10-NEXT:    s_add_i32 s3, s1, s11
 ; GFX10-NEXT:    s_mov_b32 s1, s8
+; GFX10-NEXT:    s_mov_b32 s2, s9
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = mul i128 %num, %den
   %cast = bitcast i128 %result to <4 x i32>
@@ -1082,189 +1056,168 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s17, s1, s8
 ; GFX7-NEXT:    s_mul_i32 s18, s16, s9
 ; GFX7-NEXT:    s_add_u32 s17, s17, s18
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s17, v0
-; GFX7-NEXT:    s_and_b32 s18, s18, 1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s8
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s18, v1
 ; GFX7-NEXT:    s_mul_i32 s17, s2, s8
 ; GFX7-NEXT:    s_mul_i32 s18, s1, s9
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
-; GFX7-NEXT:    s_add_u32 s17, s17, s18
-; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s8
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s19, s16, s10
-; GFX7-NEXT:    s_and_b32 s18, s18, 1
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s9
-; GFX7-NEXT:    s_add_u32 s17, s17, s19
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_mul_hi_u32 v4, s16, v3
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s17, v2
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s18, v5
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX7-NEXT:    s_mul_i32 s17, s3, s8
-; GFX7-NEXT:    s_mul_i32 s18, s2, s9
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX7-NEXT:    s_mul_i32 s19, s1, s10
-; GFX7-NEXT:    s_and_b32 s18, s18, 1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX7-NEXT:    s_add_u32 s17, s17, s19
+; GFX7-NEXT:    s_mul_i32 s17, s3, s8
+; GFX7-NEXT:    s_mul_i32 s18, s2, s9
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX7-NEXT:    s_mul_i32 s19, s1, s10
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s2
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_mul_hi_u32 v5, v4, s8
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s20, s16, s11
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
-; GFX7-NEXT:    s_add_u32 s17, s17, s20
 ; GFX7-NEXT:    v_mul_hi_u32 v3, s1, v3
+; GFX7-NEXT:    s_add_u32 s17, s17, s20
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, s17, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s10
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX7-NEXT:    v_mul_hi_u32 v7, s16, v6
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s18, v8
-; GFX7-NEXT:    s_mul_i32 s17, s4, s8
-; GFX7-NEXT:    s_mul_i32 s18, s3, s9
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GFX7-NEXT:    s_mul_i32 s19, s2, s10
-; GFX7-NEXT:    s_and_b32 s18, s18, 1
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_mul_i32 s17, s4, s8
+; GFX7-NEXT:    s_mul_i32 s18, s3, s9
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    s_mul_i32 s19, s2, s10
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX7-NEXT:    s_mul_i32 s20, s1, s11
-; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX7-NEXT:    s_add_u32 s17, s17, s20
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX7-NEXT:    s_mul_i32 s20, s1, s11
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s3
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_mul_hi_u32 v7, v5, s8
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    s_add_u32 s17, s17, s20
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s21, s16, s12
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_mul_hi_u32 v4, v4, s9
 ; GFX7-NEXT:    s_add_u32 s17, s17, s21
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX7-NEXT:    v_mul_hi_u32 v4, v4, s9
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, s17, v7
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v11, vcc, s18, v11
-; GFX7-NEXT:    s_mul_i32 s17, s5, s8
-; GFX7-NEXT:    s_mul_i32 s18, s4, s9
 ; GFX7-NEXT:    v_mul_hi_u32 v8, s1, v6
-; GFX7-NEXT:    s_add_u32 s17, s17, s18
+; GFX7-NEXT:    v_add_i32_e32 v11, vcc, s18, v11
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX7-NEXT:    s_mul_i32 s19, s3, s10
-; GFX7-NEXT:    s_and_b32 s18, s18, 1
 ; GFX7-NEXT:    v_mul_hi_u32 v10, s16, v9
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
-; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    s_mul_i32 s17, s5, s8
+; GFX7-NEXT:    s_mul_i32 s18, s4, s9
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GFX7-NEXT:    s_mul_i32 s20, s2, s11
-; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    s_mul_i32 s19, s3, s10
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GFX7-NEXT:    s_add_u32 s17, s17, s20
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
-; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX7-NEXT:    s_mul_i32 s21, s1, s12
+; GFX7-NEXT:    s_mul_i32 s20, s2, s11
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX7-NEXT:    s_add_u32 s17, s17, s20
 ; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX7-NEXT:    s_add_u32 s17, s17, s21
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GFX7-NEXT:    s_mul_i32 s21, s1, s12
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s4
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_mul_hi_u32 v8, v7, s8
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    s_add_u32 s17, s17, s21
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s22, s16, s13
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_mul_hi_u32 v10, v5, s9
 ; GFX7-NEXT:    s_add_u32 s17, s17, s22
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, s17, v8
-; GFX7-NEXT:    v_mul_hi_u32 v10, v5, s9
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GFX7-NEXT:    v_add_i32_e32 v14, vcc, s18, v14
-; GFX7-NEXT:    s_mul_i32 s17, s6, s8
-; GFX7-NEXT:    s_mul_i32 s18, s5, s9
-; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_mul_hi_u32 v6, s2, v6
-; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    v_add_i32_e32 v14, vcc, s18, v14
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GFX7-NEXT:    s_mul_i32 s19, s4, s10
-; GFX7-NEXT:    s_and_b32 s18, s18, 1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    v_mul_hi_u32 v11, s1, v9
 ; GFX7-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_mov_b32_e32 v12, s12
 ; GFX7-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX7-NEXT:    s_mul_i32 s20, s3, s11
-; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    s_mul_i32 s17, s6, s8
+; GFX7-NEXT:    s_mul_i32 s18, s5, s9
 ; GFX7-NEXT:    v_mul_hi_u32 v13, s16, v12
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GFX7-NEXT:    s_add_u32 s17, s17, s20
+; GFX7-NEXT:    s_mul_i32 s19, s4, s10
+; GFX7-NEXT:    s_add_u32 s17, s17, s18
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX7-NEXT:    s_add_u32 s17, s17, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GFX7-NEXT:    s_mul_i32 s21, s2, s12
+; GFX7-NEXT:    s_mul_i32 s20, s3, s11
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
-; GFX7-NEXT:    s_add_u32 s17, s17, s21
+; GFX7-NEXT:    s_add_u32 s17, s17, s20
 ; GFX7-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
-; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GFX7-NEXT:    s_mul_i32 s22, s1, s13
+; GFX7-NEXT:    s_mul_i32 s21, s2, s12
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX7-NEXT:    s_add_u32 s17, s17, s21
 ; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX7-NEXT:    s_add_u32 s17, s17, s22
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GFX7-NEXT:    s_mul_i32 s22, s1, s13
 ; GFX7-NEXT:    v_mov_b32_e32 v8, s5
-; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_mul_hi_u32 v10, v8, s8
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
+; GFX7-NEXT:    s_add_u32 s17, s17, s22
+; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s23, s16, s14
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
-; GFX7-NEXT:    s_add_u32 s17, s17, s23
 ; GFX7-NEXT:    v_mul_hi_u32 v11, v7, s9
+; GFX7-NEXT:    s_add_u32 s17, s17, s23
 ; GFX7-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX7-NEXT:    s_and_b32 s19, s19, 1
 ; GFX7-NEXT:    v_add_i32_e32 v10, vcc, s17, v10
 ; GFX7-NEXT:    s_add_i32 s18, s18, s19
 ; GFX7-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
@@ -1342,189 +1295,168 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s17, s1, s8
 ; GFX8-NEXT:    s_mul_i32 s18, s16, s9
 ; GFX8-NEXT:    s_add_u32 s17, s17, s18
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s17, v0
-; GFX8-NEXT:    s_and_b32 s18, s18, 1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s8
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s18, v1
 ; GFX8-NEXT:    s_mul_i32 s17, s2, s8
 ; GFX8-NEXT:    s_mul_i32 s18, s1, s9
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    s_add_u32 s17, s17, s18
-; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s8
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s19, s16, s10
-; GFX8-NEXT:    s_and_b32 s18, s18, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8-NEXT:    s_add_u32 s17, s17, s19
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_mul_hi_u32 v4, s16, v3
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s17, v2
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s18, v5
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
-; GFX8-NEXT:    s_mul_i32 s17, s3, s8
-; GFX8-NEXT:    s_mul_i32 s18, s2, s9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
-; GFX8-NEXT:    s_mul_i32 s19, s1, s10
-; GFX8-NEXT:    s_and_b32 s18, s18, 1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT:    s_add_u32 s17, s17, s19
+; GFX8-NEXT:    s_mul_i32 s17, s3, s8
+; GFX8-NEXT:    s_mul_i32 s18, s2, s9
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
+; GFX8-NEXT:    s_mul_i32 s19, s1, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v4, s8
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s20, s16, s11
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
-; GFX8-NEXT:    s_add_u32 s17, s17, s20
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v3
+; GFX8-NEXT:    s_add_u32 s17, s17, s20
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s17, v5
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s10
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_mul_hi_u32 v7, s16, v6
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s18, v8
-; GFX8-NEXT:    s_mul_i32 s17, s4, s8
-; GFX8-NEXT:    s_mul_i32 s18, s3, s9
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v8, v5
-; GFX8-NEXT:    s_mul_i32 s19, s2, s10
-; GFX8-NEXT:    s_and_b32 s18, s18, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
-; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_mul_i32 s17, s4, s8
+; GFX8-NEXT:    s_mul_i32 s18, s3, s9
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    s_mul_i32 s19, s2, s10
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    s_mul_i32 s20, s1, s11
-; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT:    s_add_u32 s17, s17, s20
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    s_mul_i32 s20, s1, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v5, s8
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    s_add_u32 s17, s17, s20
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s21, s16, s12
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_mul_hi_u32 v4, v4, s9
 ; GFX8-NEXT:    s_add_u32 s17, s17, s21
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    v_mul_hi_u32 v4, v4, s9
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s17, v7
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s18, v11
-; GFX8-NEXT:    s_mul_i32 s17, s5, s8
-; GFX8-NEXT:    s_mul_i32 s18, s4, s9
 ; GFX8-NEXT:    v_mul_hi_u32 v8, s1, v6
-; GFX8-NEXT:    s_add_u32 s17, s17, s18
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s18, v11
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT:    s_mul_i32 s19, s3, s10
-; GFX8-NEXT:    s_and_b32 s18, s18, 1
 ; GFX8-NEXT:    v_mul_hi_u32 v10, s16, v9
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v11, v7
-; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    s_mul_i32 s17, s5, s8
+; GFX8-NEXT:    s_mul_i32 s18, s4, s9
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT:    s_mul_i32 s20, s2, s11
-; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    s_mul_i32 s19, s3, s10
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v10
-; GFX8-NEXT:    s_add_u32 s17, s17, s20
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT:    s_mul_i32 s21, s1, s12
+; GFX8-NEXT:    s_mul_i32 s20, s2, s11
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT:    s_add_u32 s17, s17, s20
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT:    s_add_u32 s17, s17, s21
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT:    s_mul_i32 s21, s1, s12
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s4
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v7, s8
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    s_add_u32 s17, s17, s21
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s22, s16, s13
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_mul_hi_u32 v10, v5, s9
 ; GFX8-NEXT:    s_add_u32 s17, s17, s22
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s17, v8
-; GFX8-NEXT:    v_mul_hi_u32 v10, v5, s9
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s18, v14
-; GFX8-NEXT:    s_mul_i32 s17, s6, s8
-; GFX8-NEXT:    s_mul_i32 s18, s5, s9
-; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_mul_hi_u32 v6, s2, v6
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s18, v14
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
-; GFX8-NEXT:    s_mul_i32 s19, s4, s10
-; GFX8-NEXT:    s_and_b32 s18, s18, 1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    v_mul_hi_u32 v11, s1, v9
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v14, v10
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v12, s12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT:    s_mul_i32 s20, s3, s11
-; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    s_mul_i32 s17, s6, s8
+; GFX8-NEXT:    s_mul_i32 s18, s5, s9
 ; GFX8-NEXT:    v_mul_hi_u32 v13, s16, v12
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
-; GFX8-NEXT:    s_add_u32 s17, s17, s20
+; GFX8-NEXT:    s_mul_i32 s19, s4, s10
+; GFX8-NEXT:    s_add_u32 s17, s17, s18
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_add_u32 s17, s17, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
-; GFX8-NEXT:    s_mul_i32 s21, s2, s12
+; GFX8-NEXT:    s_mul_i32 s20, s3, s11
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v13
-; GFX8-NEXT:    s_add_u32 s17, s17, s21
+; GFX8-NEXT:    s_add_u32 s17, s17, s20
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
-; GFX8-NEXT:    s_mul_i32 s22, s1, s13
+; GFX8-NEXT:    s_mul_i32 s21, s2, s12
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT:    s_add_u32 s17, s17, s21
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    s_add_u32 s17, s17, s22
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT:    s_mul_i32 s22, s1, s13
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s5
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_mul_hi_u32 v10, v8, s8
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    s_add_u32 s17, s17, s22
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s23, s16, s14
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
-; GFX8-NEXT:    s_add_u32 s17, s17, s23
 ; GFX8-NEXT:    v_mul_hi_u32 v11, v7, s9
+; GFX8-NEXT:    s_add_u32 s17, s17, s23
 ; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s17, v10
 ; GFX8-NEXT:    s_add_i32 s18, s18, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
@@ -1599,233 +1531,186 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_mov_b32 s16, s0
 ; GFX9-NEXT:    s_mul_i32 s17, s1, s8
 ; GFX9-NEXT:    s_mul_i32 s18, s16, s9
+; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s8
 ; GFX9-NEXT:    s_add_u32 s17, s17, s18
 ; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s8
-; GFX9-NEXT:    s_and_b32 s18, s18, 1
 ; GFX9-NEXT:    s_add_u32 s17, s17, s19
 ; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX9-NEXT:    s_and_b32 s19, s19, 1
 ; GFX9-NEXT:    s_add_i32 s18, s18, s19
 ; GFX9-NEXT:    s_mul_i32 s19, s2, s8
 ; GFX9-NEXT:    s_mul_i32 s20, s1, s9
+; GFX9-NEXT:    s_mul_i32 s21, s16, s10
 ; GFX9-NEXT:    s_add_u32 s19, s19, s20
 ; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX9-NEXT:    s_mul_i32 s21, s16, s10
-; GFX9-NEXT:    s_and_b32 s20, s20, 1
 ; GFX9-NEXT:    s_add_u32 s19, s19, s21
 ; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX9-NEXT:    s_and_b32 s21, s21, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s22, s1, s8
 ; GFX9-NEXT:    s_add_i32 s20, s20, s21
 ; GFX9-NEXT:    s_add_u32 s19, s19, s22
 ; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX9-NEXT:    s_and_b32 s21, s21, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s23, s16, s9
 ; GFX9-NEXT:    s_add_i32 s20, s20, s21
 ; GFX9-NEXT:    s_add_u32 s19, s19, s23
 ; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX9-NEXT:    s_and_b32 s21, s21, 1
 ; GFX9-NEXT:    s_add_i32 s20, s20, s21
 ; GFX9-NEXT:    s_add_u32 s18, s19, s18
 ; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX9-NEXT:    s_and_b32 s19, s19, 1
 ; GFX9-NEXT:    s_add_i32 s20, s20, s19
 ; GFX9-NEXT:    s_mul_i32 s19, s3, s8
 ; GFX9-NEXT:    s_mul_i32 s21, s2, s9
+; GFX9-NEXT:    s_mul_i32 s22, s1, s10
 ; GFX9-NEXT:    s_add_u32 s19, s19, s21
 ; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX9-NEXT:    s_mul_i32 s22, s1, s10
-; GFX9-NEXT:    s_and_b32 s21, s21, 1
 ; GFX9-NEXT:    s_add_u32 s19, s19, s22
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX9-NEXT:    s_and_b32 s22, s22, 1
 ; GFX9-NEXT:    s_mul_i32 s23, s16, s11
 ; GFX9-NEXT:    s_add_i32 s21, s21, s22
 ; GFX9-NEXT:    s_add_u32 s19, s19, s23
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX9-NEXT:    s_and_b32 s22, s22, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s24, s2, s8
 ; GFX9-NEXT:    s_add_i32 s21, s21, s22
 ; GFX9-NEXT:    s_add_u32 s19, s19, s24
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX9-NEXT:    s_and_b32 s22, s22, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s25, s1, s9
 ; GFX9-NEXT:    s_add_i32 s21, s21, s22
 ; GFX9-NEXT:    s_add_u32 s19, s19, s25
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX9-NEXT:    s_and_b32 s22, s22, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s26, s16, s10
 ; GFX9-NEXT:    s_add_i32 s21, s21, s22
 ; GFX9-NEXT:    s_add_u32 s19, s19, s26
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX9-NEXT:    s_and_b32 s22, s22, 1
 ; GFX9-NEXT:    s_add_i32 s21, s21, s22
 ; GFX9-NEXT:    s_add_u32 s19, s19, s20
 ; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX9-NEXT:    s_and_b32 s20, s20, 1
 ; GFX9-NEXT:    s_add_i32 s21, s21, s20
 ; GFX9-NEXT:    s_mul_i32 s20, s4, s8
 ; GFX9-NEXT:    s_mul_i32 s22, s3, s9
+; GFX9-NEXT:    s_mul_i32 s23, s2, s10
 ; GFX9-NEXT:    s_add_u32 s20, s20, s22
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX9-NEXT:    s_mul_i32 s23, s2, s10
-; GFX9-NEXT:    s_and_b32 s22, s22, 1
 ; GFX9-NEXT:    s_add_u32 s20, s20, s23
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_mul_i32 s24, s1, s11
 ; GFX9-NEXT:    s_add_i32 s22, s22, s23
 ; GFX9-NEXT:    s_add_u32 s20, s20, s24
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_mul_i32 s25, s16, s12
 ; GFX9-NEXT:    s_add_i32 s22, s22, s23
 ; GFX9-NEXT:    s_add_u32 s20, s20, s25
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s26, s3, s8
 ; GFX9-NEXT:    s_add_i32 s22, s22, s23
 ; GFX9-NEXT:    s_add_u32 s20, s20, s26
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s27, s2, s9
 ; GFX9-NEXT:    s_add_i32 s22, s22, s23
 ; GFX9-NEXT:    s_add_u32 s20, s20, s27
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s28, s1, s10
 ; GFX9-NEXT:    s_add_i32 s22, s22, s23
 ; GFX9-NEXT:    s_add_u32 s20, s20, s28
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s29, s16, s11
 ; GFX9-NEXT:    s_add_i32 s22, s22, s23
 ; GFX9-NEXT:    s_add_u32 s20, s20, s29
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_add_i32 s22, s22, s23
 ; GFX9-NEXT:    s_add_u32 s20, s20, s21
 ; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX9-NEXT:    s_and_b32 s21, s21, 1
 ; GFX9-NEXT:    s_add_i32 s22, s22, s21
 ; GFX9-NEXT:    s_mul_i32 s21, s5, s8
 ; GFX9-NEXT:    s_mul_i32 s23, s4, s9
+; GFX9-NEXT:    s_mul_i32 s24, s3, s10
 ; GFX9-NEXT:    s_add_u32 s21, s21, s23
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX9-NEXT:    s_mul_i32 s24, s3, s10
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_add_u32 s21, s21, s24
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_mul_i32 s25, s2, s11
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s25
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_mul_i32 s26, s1, s12
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s26
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_mul_i32 s27, s16, s13
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s27
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s28, s4, s8
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s28
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s29, s3, s9
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s29
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s30, s2, s10
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s30
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s31, s1, s11
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s31
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s33, s16, s12
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s33
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_add_i32 s23, s23, s24
 ; GFX9-NEXT:    s_add_u32 s21, s21, s22
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX9-NEXT:    s_and_b32 s22, s22, 1
 ; GFX9-NEXT:    s_add_i32 s23, s23, s22
 ; GFX9-NEXT:    s_mul_i32 s22, s6, s8
 ; GFX9-NEXT:    s_mul_i32 s24, s5, s9
+; GFX9-NEXT:    s_mul_i32 s25, s4, s10
 ; GFX9-NEXT:    s_add_u32 s22, s22, s24
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX9-NEXT:    s_mul_i32 s25, s4, s10
-; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_add_u32 s22, s22, s25
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_i32 s26, s3, s11
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s26
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_i32 s27, s2, s12
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s27
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_i32 s28, s1, s13
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s28
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_i32 s29, s16, s14
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s29
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s30, s5, s8
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s30
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s31, s4, s9
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s31
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s33, s3, s10
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s33
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s34, s2, s11
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s34
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s35, s1, s12
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s35
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_mul_hi_u32 s36, s16, s13
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s36
 ; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX9-NEXT:    s_and_b32 s25, s25, 1
 ; GFX9-NEXT:    s_add_i32 s24, s24, s25
 ; GFX9-NEXT:    s_add_u32 s22, s22, s23
 ; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_add_i32 s24, s24, s23
 ; GFX9-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX9-NEXT:    s_mul_i32 s23, s6, s9
@@ -1873,268 +1758,221 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX10-NEXT:    s_mul_hi_u32 s18, s0, s8
 ; GFX10-NEXT:    s_add_u32 s16, s16, s17
 ; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX10-NEXT:    s_mul_i32 s19, s1, s9
-; GFX10-NEXT:    s_and_b32 s17, s17, 1
 ; GFX10-NEXT:    s_add_u32 s16, s16, s18
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    s_mul_i32 s20, s0, s10
-; GFX10-NEXT:    s_and_b32 s18, s18, 1
-; GFX10-NEXT:    s_mul_hi_u32 s21, s1, s8
+; GFX10-NEXT:    s_mul_i32 s19, s1, s9
 ; GFX10-NEXT:    s_add_i32 s17, s17, s18
 ; GFX10-NEXT:    s_mul_i32 s18, s2, s8
-; GFX10-NEXT:    s_mul_i32 s22, s0, s11
+; GFX10-NEXT:    s_mul_i32 s20, s0, s10
 ; GFX10-NEXT:    s_add_u32 s18, s18, s19
 ; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX10-NEXT:    s_mul_i32 s23, s1, s11
-; GFX10-NEXT:    s_and_b32 s19, s19, 1
 ; GFX10-NEXT:    s_add_u32 s18, s18, s20
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX10-NEXT:    s_mul_i32 s24, s0, s12
-; GFX10-NEXT:    s_and_b32 s20, s20, 1
-; GFX10-NEXT:    s_mul_i32 s25, s4, s9
+; GFX10-NEXT:    s_mul_hi_u32 s21, s1, s8
 ; GFX10-NEXT:    s_add_i32 s19, s19, s20
 ; GFX10-NEXT:    s_add_u32 s18, s18, s21
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s21, s0, s9
-; GFX10-NEXT:    s_and_b32 s20, s20, 1
-; GFX10-NEXT:    s_mul_i32 s26, s2, s11
+; GFX10-NEXT:    s_mul_hi_u32 s22, s0, s9
 ; GFX10-NEXT:    s_add_i32 s19, s19, s20
-; GFX10-NEXT:    s_add_u32 s18, s18, s21
+; GFX10-NEXT:    s_add_u32 s18, s18, s22
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX10-NEXT:    s_mul_i32 s21, s1, s10
-; GFX10-NEXT:    s_and_b32 s20, s20, 1
-; GFX10-NEXT:    s_mul_i32 s27, s0, s13
 ; GFX10-NEXT:    s_add_i32 s19, s19, s20
 ; GFX10-NEXT:    s_add_u32 s17, s18, s17
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX10-NEXT:    s_mul_i32 s20, s2, s9
-; GFX10-NEXT:    s_and_b32 s18, s18, 1
-; GFX10-NEXT:    s_mul_hi_u32 s28, s3, s9
 ; GFX10-NEXT:    s_add_i32 s19, s19, s18
 ; GFX10-NEXT:    s_mul_i32 s18, s3, s8
-; GFX10-NEXT:    s_mul_i32 s7, s7, s8
+; GFX10-NEXT:    s_mul_i32 s22, s0, s11
 ; GFX10-NEXT:    s_add_u32 s18, s18, s20
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX10-NEXT:    s_mul_i32 s15, s0, s15
-; GFX10-NEXT:    s_and_b32 s20, s20, 1
 ; GFX10-NEXT:    s_add_u32 s18, s18, s21
 ; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10-NEXT:    s_and_b32 s21, s21, 1
+; GFX10-NEXT:    s_mul_hi_u32 s23, s2, s8
 ; GFX10-NEXT:    s_add_i32 s20, s20, s21
 ; GFX10-NEXT:    s_add_u32 s18, s18, s22
 ; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s22, s2, s8
-; GFX10-NEXT:    s_and_b32 s21, s21, 1
+; GFX10-NEXT:    s_mul_hi_u32 s24, s1, s9
 ; GFX10-NEXT:    s_add_i32 s20, s20, s21
-; GFX10-NEXT:    s_add_u32 s18, s18, s22
+; GFX10-NEXT:    s_add_u32 s18, s18, s23
 ; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s22, s1, s9
-; GFX10-NEXT:    s_and_b32 s21, s21, 1
+; GFX10-NEXT:    s_mul_hi_u32 s25, s0, s10
 ; GFX10-NEXT:    s_add_i32 s20, s20, s21
-; GFX10-NEXT:    s_add_u32 s18, s18, s22
+; GFX10-NEXT:    s_add_u32 s18, s18, s24
 ; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s22, s0, s10
-; GFX10-NEXT:    s_and_b32 s21, s21, 1
+; GFX10-NEXT:    s_mul_i32 s22, s2, s10
 ; GFX10-NEXT:    s_add_i32 s20, s20, s21
-; GFX10-NEXT:    s_add_u32 s18, s18, s22
+; GFX10-NEXT:    s_add_u32 s18, s18, s25
 ; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10-NEXT:    s_mul_i32 s22, s2, s10
-; GFX10-NEXT:    s_and_b32 s21, s21, 1
+; GFX10-NEXT:    s_mul_i32 s23, s1, s11
 ; GFX10-NEXT:    s_add_i32 s20, s20, s21
 ; GFX10-NEXT:    s_add_u32 s18, s18, s19
 ; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX10-NEXT:    s_mul_i32 s21, s3, s9
-; GFX10-NEXT:    s_and_b32 s19, s19, 1
 ; GFX10-NEXT:    s_add_i32 s20, s20, s19
 ; GFX10-NEXT:    s_mul_i32 s19, s4, s8
+; GFX10-NEXT:    s_mul_i32 s24, s0, s12
 ; GFX10-NEXT:    s_add_u32 s19, s19, s21
 ; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10-NEXT:    s_and_b32 s21, s21, 1
 ; GFX10-NEXT:    s_add_u32 s19, s19, s22
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX10-NEXT:    s_and_b32 s22, s22, 1
+; GFX10-NEXT:    s_mul_hi_u32 s25, s3, s8
 ; GFX10-NEXT:    s_add_i32 s21, s21, s22
 ; GFX10-NEXT:    s_add_u32 s19, s19, s23
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s23, s3, s8
-; GFX10-NEXT:    s_and_b32 s22, s22, 1
+; GFX10-NEXT:    s_mul_hi_u32 s26, s2, s9
 ; GFX10-NEXT:    s_add_i32 s21, s21, s22
 ; GFX10-NEXT:    s_add_u32 s19, s19, s24
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s24, s2, s9
-; GFX10-NEXT:    s_and_b32 s22, s22, 1
+; GFX10-NEXT:    s_mul_hi_u32 s27, s1, s10
 ; GFX10-NEXT:    s_add_i32 s21, s21, s22
-; GFX10-NEXT:    s_add_u32 s19, s19, s23
+; GFX10-NEXT:    s_add_u32 s19, s19, s25
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s23, s1, s10
-; GFX10-NEXT:    s_and_b32 s22, s22, 1
+; GFX10-NEXT:    s_mul_hi_u32 s28, s0, s11
 ; GFX10-NEXT:    s_add_i32 s21, s21, s22
-; GFX10-NEXT:    s_add_u32 s19, s19, s24
+; GFX10-NEXT:    s_add_u32 s19, s19, s26
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s24, s0, s11
-; GFX10-NEXT:    s_and_b32 s22, s22, 1
+; GFX10-NEXT:    s_mul_i32 s23, s3, s10
 ; GFX10-NEXT:    s_add_i32 s21, s21, s22
-; GFX10-NEXT:    s_add_u32 s19, s19, s23
+; GFX10-NEXT:    s_add_u32 s19, s19, s27
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX10-NEXT:    s_mul_i32 s23, s5, s8
-; GFX10-NEXT:    s_and_b32 s22, s22, 1
+; GFX10-NEXT:    s_mul_i32 s24, s2, s11
 ; GFX10-NEXT:    s_add_i32 s21, s21, s22
-; GFX10-NEXT:    s_add_u32 s19, s19, s24
+; GFX10-NEXT:    s_add_u32 s19, s19, s28
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX10-NEXT:    s_mul_i32 s24, s3, s10
-; GFX10-NEXT:    s_and_b32 s22, s22, 1
+; GFX10-NEXT:    s_mul_i32 s25, s1, s12
 ; GFX10-NEXT:    s_add_i32 s21, s21, s22
 ; GFX10-NEXT:    s_add_u32 s19, s19, s20
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX10-NEXT:    s_mul_i32 s22, s1, s12
-; GFX10-NEXT:    s_and_b32 s20, s20, 1
+; GFX10-NEXT:    s_mul_i32 s22, s4, s9
 ; GFX10-NEXT:    s_add_i32 s21, s21, s20
-; GFX10-NEXT:    s_add_u32 s23, s23, s25
-; GFX10-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s20, s4, s8
-; GFX10-NEXT:    s_and_b32 s25, s25, 1
-; GFX10-NEXT:    s_add_u32 s23, s23, s24
-; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
-; GFX10-NEXT:    s_add_i32 s24, s25, s24
-; GFX10-NEXT:    s_add_u32 s23, s23, s26
-; GFX10-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s26, s2, s10
-; GFX10-NEXT:    s_and_b32 s25, s25, 1
-; GFX10-NEXT:    s_add_i32 s24, s24, s25
-; GFX10-NEXT:    s_add_u32 s22, s23, s22
-; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s25, s1, s11
-; GFX10-NEXT:    s_and_b32 s23, s23, 1
-; GFX10-NEXT:    s_add_i32 s23, s24, s23
-; GFX10-NEXT:    s_add_u32 s22, s22, s27
-; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s27, s0, s12
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
-; GFX10-NEXT:    s_add_i32 s23, s23, s24
-; GFX10-NEXT:    s_add_u32 s20, s22, s20
+; GFX10-NEXT:    s_mul_i32 s20, s5, s8
+; GFX10-NEXT:    s_mul_i32 s26, s0, s13
+; GFX10-NEXT:    s_add_u32 s20, s20, s22
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX10-NEXT:    s_mul_i32 s24, s6, s8
-; GFX10-NEXT:    s_and_b32 s22, s22, 1
-; GFX10-NEXT:    s_add_i32 s22, s23, s22
-; GFX10-NEXT:    s_add_u32 s20, s20, s28
+; GFX10-NEXT:    s_add_u32 s20, s20, s23
 ; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX10-NEXT:    s_mul_i32 s28, s5, s9
-; GFX10-NEXT:    s_and_b32 s23, s23, 1
+; GFX10-NEXT:    s_mul_hi_u32 s27, s4, s8
 ; GFX10-NEXT:    s_add_i32 s22, s22, s23
-; GFX10-NEXT:    s_add_u32 s20, s20, s26
+; GFX10-NEXT:    s_add_u32 s20, s20, s24
 ; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX10-NEXT:    s_mul_i32 s26, s4, s10
-; GFX10-NEXT:    s_and_b32 s23, s23, 1
+; GFX10-NEXT:    s_mul_hi_u32 s28, s3, s9
 ; GFX10-NEXT:    s_add_i32 s22, s22, s23
 ; GFX10-NEXT:    s_add_u32 s20, s20, s25
 ; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX10-NEXT:    s_mul_i32 s25, s3, s11
-; GFX10-NEXT:    s_and_b32 s23, s23, 1
+; GFX10-NEXT:    s_mul_hi_u32 s29, s2, s10
+; GFX10-NEXT:    s_add_i32 s22, s22, s23
+; GFX10-NEXT:    s_add_u32 s20, s20, s26
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_mul_hi_u32 s30, s1, s11
 ; GFX10-NEXT:    s_add_i32 s22, s22, s23
 ; GFX10-NEXT:    s_add_u32 s20, s20, s27
 ; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX10-NEXT:    s_mul_i32 s27, s2, s12
-; GFX10-NEXT:    s_and_b32 s23, s23, 1
+; GFX10-NEXT:    s_mul_hi_u32 s31, s0, s12
+; GFX10-NEXT:    s_add_i32 s22, s22, s23
+; GFX10-NEXT:    s_add_u32 s20, s20, s28
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_mul_i32 s24, s4, s10
+; GFX10-NEXT:    s_add_i32 s22, s22, s23
+; GFX10-NEXT:    s_add_u32 s20, s20, s29
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_mul_i32 s25, s3, s11
+; GFX10-NEXT:    s_add_i32 s22, s22, s23
+; GFX10-NEXT:    s_add_u32 s20, s20, s30
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_mul_i32 s26, s2, s12
+; GFX10-NEXT:    s_add_i32 s22, s22, s23
+; GFX10-NEXT:    s_add_u32 s20, s20, s31
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_mul_i32 s27, s1, s13
 ; GFX10-NEXT:    s_add_i32 s22, s22, s23
 ; GFX10-NEXT:    s_add_u32 s20, s20, s21
 ; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10-NEXT:    s_mul_i32 s23, s1, s13
-; GFX10-NEXT:    s_and_b32 s21, s21, 1
+; GFX10-NEXT:    s_mul_i32 s23, s5, s9
 ; GFX10-NEXT:    s_add_i32 s22, s22, s21
-; GFX10-NEXT:    s_add_u32 s21, s24, s28
-; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX10-NEXT:    s_mul_i32 s21, s6, s8
 ; GFX10-NEXT:    s_mul_i32 s28, s0, s14
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
-; GFX10-NEXT:    s_add_u32 s21, s21, s26
-; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
-; GFX10-NEXT:    s_and_b32 s26, s26, 1
-; GFX10-NEXT:    s_add_i32 s24, s24, s26
-; GFX10-NEXT:    s_add_u32 s21, s21, s25
-; GFX10-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s26, s5, s8
-; GFX10-NEXT:    s_and_b32 s25, s25, 1
-; GFX10-NEXT:    s_add_i32 s24, s24, s25
-; GFX10-NEXT:    s_add_u32 s21, s21, s27
-; GFX10-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s27, s4, s9
-; GFX10-NEXT:    s_and_b32 s25, s25, 1
-; GFX10-NEXT:    s_add_i32 s24, s24, s25
 ; GFX10-NEXT:    s_add_u32 s21, s21, s23
 ; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s25, s3, s10
-; GFX10-NEXT:    s_and_b32 s23, s23, 1
-; GFX10-NEXT:    s_add_i32 s23, s24, s23
-; GFX10-NEXT:    s_add_u32 s21, s21, s28
+; GFX10-NEXT:    s_add_u32 s21, s21, s24
 ; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s28, s2, s11
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
+; GFX10-NEXT:    s_mul_hi_u32 s29, s5, s8
+; GFX10-NEXT:    s_add_i32 s23, s23, s24
+; GFX10-NEXT:    s_add_u32 s21, s21, s25
+; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX10-NEXT:    s_mul_hi_u32 s30, s4, s9
 ; GFX10-NEXT:    s_add_i32 s23, s23, s24
 ; GFX10-NEXT:    s_add_u32 s21, s21, s26
 ; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s26, s1, s12
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
+; GFX10-NEXT:    s_mul_hi_u32 s31, s3, s10
 ; GFX10-NEXT:    s_add_i32 s23, s23, s24
 ; GFX10-NEXT:    s_add_u32 s21, s21, s27
 ; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10-NEXT:    s_mul_hi_u32 s27, s0, s13
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
+; GFX10-NEXT:    s_mul_hi_u32 s33, s2, s11
 ; GFX10-NEXT:    s_add_i32 s23, s23, s24
-; GFX10-NEXT:    s_add_u32 s21, s21, s25
+; GFX10-NEXT:    s_add_u32 s21, s21, s28
 ; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10-NEXT:    s_mul_i32 s25, s6, s9
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
-; GFX10-NEXT:    s_mul_hi_u32 s6, s6, s8
+; GFX10-NEXT:    s_mul_hi_u32 s34, s1, s12
 ; GFX10-NEXT:    s_add_i32 s23, s23, s24
-; GFX10-NEXT:    s_add_u32 s21, s21, s28
+; GFX10-NEXT:    s_add_u32 s21, s21, s29
 ; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
+; GFX10-NEXT:    s_mul_hi_u32 s35, s0, s13
 ; GFX10-NEXT:    s_add_i32 s23, s23, s24
-; GFX10-NEXT:    s_add_u32 s21, s21, s26
+; GFX10-NEXT:    s_add_u32 s21, s21, s30
+; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX10-NEXT:    s_mul_i32 s7, s7, s8
+; GFX10-NEXT:    s_add_i32 s23, s23, s24
+; GFX10-NEXT:    s_add_u32 s21, s21, s31
+; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX10-NEXT:    s_mul_i32 s25, s5, s10
+; GFX10-NEXT:    s_add_i32 s23, s23, s24
+; GFX10-NEXT:    s_add_u32 s21, s21, s33
+; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX10-NEXT:    s_mul_i32 s15, s0, s15
+; GFX10-NEXT:    s_add_i32 s23, s23, s24
+; GFX10-NEXT:    s_add_u32 s21, s21, s34
 ; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10-NEXT:    s_mul_i32 s26, s5, s10
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
 ; GFX10-NEXT:    s_mul_hi_u32 s5, s5, s9
 ; GFX10-NEXT:    s_add_i32 s23, s23, s24
-; GFX10-NEXT:    s_add_u32 s21, s21, s27
+; GFX10-NEXT:    s_add_u32 s21, s21, s35
 ; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10-NEXT:    s_mul_i32 s27, s4, s11
-; GFX10-NEXT:    s_and_b32 s24, s24, 1
-; GFX10-NEXT:    s_mul_hi_u32 s4, s4, s10
 ; GFX10-NEXT:    s_add_i32 s23, s23, s24
+; GFX10-NEXT:    s_mul_i32 s24, s6, s9
 ; GFX10-NEXT:    s_add_u32 s21, s21, s22
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX10-NEXT:    s_add_i32 s7, s7, s24
+; GFX10-NEXT:    s_mul_i32 s24, s4, s11
 ; GFX10-NEXT:    s_add_i32 s7, s7, s25
-; GFX10-NEXT:    s_mul_i32 s24, s3, s12
-; GFX10-NEXT:    s_add_i32 s7, s7, s26
-; GFX10-NEXT:    s_mul_i32 s25, s2, s13
-; GFX10-NEXT:    s_add_i32 s7, s7, s27
-; GFX10-NEXT:    s_mul_i32 s26, s1, s14
+; GFX10-NEXT:    s_mul_i32 s25, s3, s12
 ; GFX10-NEXT:    s_add_i32 s7, s7, s24
-; GFX10-NEXT:    s_mul_hi_u32 s3, s3, s11
+; GFX10-NEXT:    s_mul_i32 s24, s2, s13
 ; GFX10-NEXT:    s_add_i32 s7, s7, s25
-; GFX10-NEXT:    s_mul_hi_u32 s2, s2, s12
-; GFX10-NEXT:    s_add_i32 s7, s7, s26
-; GFX10-NEXT:    s_mul_hi_u32 s1, s1, s13
+; GFX10-NEXT:    s_mul_i32 s25, s1, s14
+; GFX10-NEXT:    s_add_i32 s7, s7, s24
+; GFX10-NEXT:    s_mul_hi_u32 s6, s6, s8
+; GFX10-NEXT:    s_add_i32 s7, s7, s25
+; GFX10-NEXT:    s_mul_hi_u32 s4, s4, s10
 ; GFX10-NEXT:    s_add_i32 s7, s7, s15
+; GFX10-NEXT:    s_mul_hi_u32 s3, s3, s11
 ; GFX10-NEXT:    s_add_i32 s6, s7, s6
+; GFX10-NEXT:    s_mul_hi_u32 s2, s2, s12
 ; GFX10-NEXT:    s_add_i32 s5, s6, s5
-; GFX10-NEXT:    s_mov_b32 s6, s21
+; GFX10-NEXT:    s_mul_hi_u32 s1, s1, s13
 ; GFX10-NEXT:    s_add_i32 s4, s5, s4
-; GFX10-NEXT:    s_mov_b32 s5, s20
+; GFX10-NEXT:    s_add_i32 s23, s23, s22
 ; GFX10-NEXT:    s_add_i32 s3, s4, s3
-; GFX10-NEXT:    s_mul_hi_u32 s4, s0, s14
+; GFX10-NEXT:    s_mov_b32 s4, s19
 ; GFX10-NEXT:    s_add_i32 s2, s3, s2
-; GFX10-NEXT:    s_and_b32 s3, s22, 1
+; GFX10-NEXT:    s_mul_hi_u32 s3, s0, s14
 ; GFX10-NEXT:    s_add_i32 s1, s2, s1
-; GFX10-NEXT:    s_add_i32 s23, s23, s3
-; GFX10-NEXT:    s_add_i32 s1, s1, s4
 ; GFX10-NEXT:    s_mul_i32 s0, s0, s8
+; GFX10-NEXT:    s_add_i32 s1, s1, s3
+; GFX10-NEXT:    s_mov_b32 s2, s17
 ; GFX10-NEXT:    s_add_i32 s7, s1, s23
 ; GFX10-NEXT:    s_mov_b32 s1, s16
-; GFX10-NEXT:    s_mov_b32 s2, s17
 ; GFX10-NEXT:    s_mov_b32 s3, s18
-; GFX10-NEXT:    s_mov_b32 s4, s19
+; GFX10-NEXT:    s_mov_b32 s5, s20
+; GFX10-NEXT:    s_mov_b32 s6, s21
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = mul i256 %num, %den
   %cast = bitcast i256 %result to <8 x i32>

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 642b20879ea04..db72cf406c9cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4217,9 +4217,6 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s4, s0, s2
-; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX6-NEXT:    s_and_b32 s5, s5, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
@@ -4243,9 +4240,6 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-LABEL: s_saddsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s4, s0, s2
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX8-NEXT:    s_and_b32 s5, s5, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -4269,9 +4263,6 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-LABEL: s_saddsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s4, s0, s2
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX9-NEXT:    s_and_b32 s5, s5, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -4295,15 +4286,12 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-LABEL: s_saddsat_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s4, s0, s2
-; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], 0
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_addc_u32 s5, s1, s3
-; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], 0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], s[0:1]
+; GFX10-NEXT:    s_mov_b32 s3, 0
 ; GFX10-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_xor_b32 s2, s2, s1
 ; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
@@ -4559,9 +4547,6 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-LABEL: s_saddsat_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s8, s0, s4
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    s_and_b32 s9, s9, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
@@ -4572,16 +4557,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX6-NEXT:    s_addc_u32 s1, s4, s5
-; GFX6-NEXT:    s_add_u32 s0, s2, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s8
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
+; GFX6-NEXT:    s_addc_u32 s1, s4, s5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_add_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX6-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
@@ -4608,9 +4590,6 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-LABEL: s_saddsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s8, s0, s4
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_and_b32 s9, s9, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -4621,16 +4600,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX8-NEXT:    s_addc_u32 s1, s4, s5
-; GFX8-NEXT:    s_add_u32 s0, s2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_addc_u32 s1, s4, s5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_add_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -4657,9 +4633,6 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-LABEL: s_saddsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s8, s0, s4
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_and_b32 s9, s9, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -4670,16 +4643,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    s_brev_b32 s5, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX9-NEXT:    s_addc_u32 s1, s4, s5
-; GFX9-NEXT:    s_add_u32 s0, s2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_addc_u32 s1, s4, s5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_add_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -4706,32 +4676,26 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-LABEL: s_saddsat_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s8, s0, s4
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[4:5], 0
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
 ; GFX10-NEXT:    s_mov_b32 s11, 0
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX10-NEXT:    s_brev_b32 s10, 1
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
-; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    s_xor_b32 s8, s4, s1
 ; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s8
 ; GFX10-NEXT:    s_addc_u32 s1, s0, s10
 ; GFX10-NEXT:    s_add_u32 s4, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
-; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_addc_u32 s5, s3, s7
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[6:7], 0
 ; GFX10-NEXT:    s_ashr_i32 s0, s5, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
 ; GFX10-NEXT:    s_xor_b32 s2, s3, s2
 ; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
@@ -4750,19 +4714,10 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s4, s0, s4
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX6-NEXT:    s_and_b32 s8, s8, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX6-NEXT:    s_addc_u32 s5, s1, s5
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX6-NEXT:    s_and_b32 s8, s8, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX6-NEXT:    s_addc_u32 s8, s2, s6
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    s_and_b32 s9, s9, 1
+; GFX6-NEXT:    s_addc_u32 s5, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX6-NEXT:    s_addc_u32 s8, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s9, s3, s7
@@ -4779,15 +4734,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_addc_u32 s1, s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s2, s0, 0
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_and_b32 s3, s3, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
@@ -4812,18 +4761,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-LABEL: s_saddsat_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s4, s0, s4
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX8-NEXT:    s_and_b32 s8, s8, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s5, s1, s5
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX8-NEXT:    s_and_b32 s8, s8, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX8-NEXT:    s_addc_u32 s8, s2, s6
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_and_b32 s9, s9, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX8-NEXT:    s_addc_u32 s8, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_addc_u32 s9, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -4845,17 +4785,11 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX8-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX8-NEXT:    s_mov_b32 s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_addc_u32 s1, s0, 0
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s2, s0, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
@@ -4880,18 +4814,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-LABEL: s_saddsat_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s4, s0, s4
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_and_b32 s8, s8, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s5, s1, s5
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_and_b32 s8, s8, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX9-NEXT:    s_addc_u32 s8, s2, s6
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_and_b32 s9, s9, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX9-NEXT:    s_addc_u32 s8, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_addc_u32 s9, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -4913,17 +4838,11 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX9-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_addc_u32 s1, s0, 0
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s2, s0, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
@@ -4948,60 +4867,45 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-LABEL: s_saddsat_i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s4, s0, s4
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s10, s[6:7], 0
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s5, s1, s5
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s8, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s8
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
 ; GFX10-NEXT:    s_addc_u32 s9, s3, s7
 ; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[8:9], s[2:3]
+; GFX10-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[6:7], 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s10
 ; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
 ; GFX10-NEXT:    s_mov_b32 s1, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-NEXT:    s_addc_u32 s1, s0, 0
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_addc_u32 s2, s0, 0
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10-NEXT:    s_addc_u32 s3, s0, 0x80000000
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, s3, vcc_lo
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
@@ -5527,19 +5431,10 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-LABEL: s_saddsat_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s8, s0, s8
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX6-NEXT:    s_and_b32 s16, s16, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_addc_u32 s9, s1, s9
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX6-NEXT:    s_and_b32 s16, s16, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_addc_u32 s16, s2, s10
-; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    s_and_b32 s17, s17, 1
+; GFX6-NEXT:    s_addc_u32 s9, s1, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_addc_u32 s16, s2, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s17, s3, s11
@@ -5551,50 +5446,35 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
-; GFX6-NEXT:    s_brev_b32 s10, 1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX6-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_addc_u32 s1, s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX6-NEXT:    s_brev_b32 s10, 1
 ; GFX6-NEXT:    s_addc_u32 s2, s0, 0
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX6-NEXT:    s_and_b32 s3, s3, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_addc_u32 s3, s0, s10
-; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    s_add_u32 s0, s4, s12
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s1, s5, s13
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s9
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s17
-; GFX6-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX6-NEXT:    s_add_u32 s0, s4, s12
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    s_and_b32 s3, s3, 1
+; GFX6-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX6-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s3, s7, s15
@@ -5611,15 +5491,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    s_mov_b32 s5, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_addc_u32 s5, s4, 0
-; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX6-NEXT:    s_and_b32 s6, s6, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_addc_u32 s6, s4, 0
-; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_and_b32 s7, s7, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_addc_u32 s7, s4, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
@@ -5648,18 +5522,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-LABEL: s_saddsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s8, s0, s8
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_and_b32 s16, s16, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_addc_u32 s9, s1, s9
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_and_b32 s16, s16, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX8-NEXT:    s_addc_u32 s16, s2, s10
-; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX8-NEXT:    s_and_b32 s17, s17, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX8-NEXT:    s_addc_u32 s16, s2, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_addc_u32 s17, s3, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -5681,46 +5546,31 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX8-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX8-NEXT:    s_mov_b32 s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_addc_u32 s1, s0, 0
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX8-NEXT:    s_addc_u32 s2, s0, 0
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    s_brev_b32 s10, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    s_addc_u32 s2, s0, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_addc_u32 s3, s0, s10
-; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    s_add_u32 s0, s4, s12
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s1, s5, s13
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s9
-; GFX8-NEXT:    s_addc_u32 s2, s6, s14
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s17
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_add_u32 s0, s4, s12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX8-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
@@ -5742,17 +5592,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX8-NEXT:    s_mov_b32 s5, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_addc_u32 s5, s4, 0
-; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX8-NEXT:    s_and_b32 s6, s6, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_addc_u32 s6, s4, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_and_b32 s7, s7, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_addc_u32 s7, s4, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
@@ -5781,18 +5625,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-LABEL: s_saddsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s8, s0, s8
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_and_b32 s16, s16, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_addc_u32 s9, s1, s9
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_and_b32 s16, s16, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX9-NEXT:    s_addc_u32 s16, s2, s10
-; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX9-NEXT:    s_and_b32 s17, s17, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_addc_u32 s16, s2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_addc_u32 s17, s3, s11
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -5814,46 +5649,31 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX9-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_addc_u32 s1, s0, 0
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_addc_u32 s2, s0, 0
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    s_brev_b32 s10, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    s_addc_u32 s2, s0, 0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_addc_u32 s3, s0, s10
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    s_add_u32 s0, s4, s12
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s13
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s9
-; GFX9-NEXT:    s_addc_u32 s2, s6, s14
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s17
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_add_u32 s0, s4, s12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
@@ -5875,17 +5695,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_mov_b32 s5, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_addc_u32 s5, s4, 0
-; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX9-NEXT:    s_and_b32 s6, s6, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_addc_u32 s6, s4, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    s_and_b32 s7, s7, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_addc_u32 s7, s4, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
@@ -5914,25 +5728,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-LABEL: s_saddsat_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s8, s0, s8
-; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX10-NEXT:    s_and_b32 s16, s16, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_addc_u32 s9, s1, s9
-; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT:    s_and_b32 s16, s16, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_addc_u32 s16, s2, s10
-; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s17, s17, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
 ; GFX10-NEXT:    s_addc_u32 s17, s3, s11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
 ; GFX10-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, s17
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, s17
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s18
 ; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
@@ -5940,91 +5745,70 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
-; GFX10-NEXT:    s_brev_b32 s10, 1
+; GFX10-NEXT:    s_ashr_i32 s2, s17, 31
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    s_brev_b32 s11, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX10-NEXT:    s_ashr_i32 s0, s17, 31
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s9
-; GFX10-NEXT:    s_addc_u32 s1, s0, 0
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    s_addc_u32 s1, s2, 0
+; GFX10-NEXT:    s_addc_u32 s10, s2, 0
+; GFX10-NEXT:    s_addc_u32 s3, s2, s11
+; GFX10-NEXT:    s_add_u32 s12, s4, s12
+; GFX10-NEXT:    s_addc_u32 s13, s5, s13
+; GFX10-NEXT:    s_addc_u32 s18, s6, s14
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[12:13], s[4:5]
+; GFX10-NEXT:    s_addc_u32 s19, s7, s15
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, s[14:15], 0
+; GFX10-NEXT:    s_cmp_eq_u64 s[18:19], s[6:7]
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s8
-; GFX10-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[18:19], s[6:7]
+; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_addc_u32 s2, s0, 0
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s8
+; GFX10-NEXT:    s_and_b32 s4, 1, s4
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_addc_u32 s3, s0, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT:    s_add_u32 s0, s4, s12
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, s9
+; GFX10-NEXT:    v_mov_b32_e32 v6, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v7, s19
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, 0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s16
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    s_ashr_i32 s0, s19, 31
+; GFX10-NEXT:    v_xor_b32_e32 v2, v3, v2
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_addc_u32 s1, s5, s13
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, s10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s3, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v5, s12
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0
+; GFX10-NEXT:    s_addc_u32 s2, s0, 0
+; GFX10-NEXT:    s_addc_u32 s3, s0, s11
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, s1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[14:15], 0
-; GFX10-NEXT:    s_addc_u32 s8, s6, s14
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    v_mov_b32_e32 v7, s8
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
-; GFX10-NEXT:    s_addc_u32 s9, s7, s15
-; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[6:7]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v8, s9
-; GFX10-NEXT:    s_and_b32 s2, 1, s2
-; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s3, 1, s3
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s2, 0, s3
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX10-NEXT:    s_mov_b32 s3, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, 0, s2
-; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v6, s1
-; GFX10-NEXT:    s_addc_u32 s3, s2, 0
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    v_mov_b32_e32 v5, s0
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    s_addc_u32 s4, s2, 0
-; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX10-NEXT:    s_addc_u32 s1, s2, s10
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v8, s1, vcc_lo
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
-; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, s3, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX10-NEXT:    v_readfirstlane_b32 s6, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0b1105fba0eba..d4378da215ee5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -208,14 +208,8 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_ashr_i32 s6, s3, 31
 ; CHECK-NEXT:    s_ashr_i32 s8, s5, 31
 ; CHECK-NEXT:    s_add_u32 s0, s2, s6
-; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
-; CHECK-NEXT:    s_and_b32 s1, s1, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
 ; CHECK-NEXT:    s_addc_u32 s1, s3, s6
 ; CHECK-NEXT:    s_add_u32 s10, s4, s8
-; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
-; CHECK-NEXT:    s_and_b32 s3, s3, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    s_addc_u32 s11, s5, s8
 ; CHECK-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
@@ -226,21 +220,18 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_sub_u32 s0, 0, s10
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
-; CHECK-NEXT:    s_and_b32 s1, s1, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    s_subb_u32 s1, 0, s11
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT:    s_subb_u32 s1, 0, s11
-; CHECK-NEXT:    v_mul_lo_u32 v3, s0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, s1, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, s0, v1
+; CHECK-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s0, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_mul_lo_u32 v3, v1, v4
 ; CHECK-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -1196,43 +1187,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_movk_i32 s10, 0x1000
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    s_mov_b32 s6, 0
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
 ; GISEL-NEXT:    s_sub_u32 s4, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_subb_u32 s5, 0, s9
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1256,7 +1242,6 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
@@ -1327,15 +1312,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v8
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v9, vcc
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
@@ -1347,25 +1329,22 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
 ; GISEL-NEXT:    s_sub_u32 s4, 0, s6
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_subb_u32 s5, 0, s7
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1912,43 +1891,38 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s10, 0x12d8fb
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    s_mov_b32 s6, 0
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
 ; GISEL-NEXT:    s_sub_u32 s4, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_subb_u32 s5, 0, s9
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1972,7 +1946,6 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
@@ -2043,15 +2016,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v8
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v9, vcc
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
@@ -2063,25 +2033,22 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
 ; GISEL-NEXT:    s_sub_u32 s4, 0, s6
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_subb_u32 s5, 0, s7
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 25eae693c1634..5d773c3d9c5ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -150,14 +150,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
 ; GFX8-NEXT:    s_ashr_i32 s12, s11, 31
 ; GFX8-NEXT:    s_add_u32 s0, s8, s2
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s9, s2
 ; GFX8-NEXT:    s_add_u32 s8, s10, s12
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    s_mov_b32 s13, s12
 ; GFX8-NEXT:    s_addc_u32 s9, s11, s12
 ; GFX8-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
@@ -169,8 +163,7 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_sub_u32 s0, 0, s8
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
@@ -178,8 +171,6 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
@@ -329,14 +320,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    s_ashr_i32 s2, s9, 31
 ; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
 ; GFX9-NEXT:    s_add_u32 s0, s8, s2
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s9, s2
 ; GFX9-NEXT:    s_add_u32 s8, s10, s12
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    s_mov_b32 s13, s12
 ; GFX9-NEXT:    s_addc_u32 s9, s11, s12
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
@@ -348,8 +333,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s8
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
+; GFX9-NEXT:    v_mov_b32_e32 v8, s11
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -357,27 +342,24 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -499,27 +481,18 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
 ; GFX10-NEXT:    s_ashr_i32 s12, s11, 31
 ; GFX10-NEXT:    s_add_u32 s0, s8, s2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_mov_b32 s13, s12
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s9, s2
 ; GFX10-NEXT:    s_add_u32 s8, s10, s12
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_mov_b32 s3, s2
+; GFX10-NEXT:    s_mov_b32 s13, s12
 ; GFX10-NEXT:    s_addc_u32 s9, s11, s12
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    s_mov_b32 s3, s2
 ; GFX10-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX10-NEXT:    s_sub_u32 s10, 0, s8
-; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX10-NEXT:    s_and_b32 s11, s11, 1
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    s_subb_u32 s11, 0, s9
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1335,14 +1308,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
 ; GFX8-NEXT:    s_ashr_i32 s6, s13, 31
 ; GFX8-NEXT:    s_add_u32 s0, s8, s2
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s9, s2
 ; GFX8-NEXT:    s_add_u32 s8, s12, s6
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    s_mov_b32 s7, s6
 ; GFX8-NEXT:    s_addc_u32 s9, s13, s6
 ; GFX8-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
@@ -1354,8 +1321,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_sub_u32 s0, 0, s8
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1363,8 +1329,6 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
@@ -1496,14 +1460,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    s_add_u32 s0, s10, s6
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s11, s6
 ; GFX8-NEXT:    s_add_u32 s10, s14, s8
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    s_mov_b32 s9, s8
 ; GFX8-NEXT:    s_addc_u32 s11, s15, s8
 ; GFX8-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
@@ -1516,8 +1474,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_f32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GFX8-NEXT:    s_sub_u32 s0, 0, s10
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_subb_u32 s1, 0, s11
+; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GFX8-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v4
 ; GFX8-NEXT:    v_trunc_f32_e32 v6, v6
@@ -1525,17 +1483,14 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_f32_e32 v4, v7, v4
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_subb_u32 s1, 0, s11
+; GFX8-NEXT:    v_xor_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s1, v7
 ; GFX8-NEXT:    v_mul_lo_u32 v8, s0, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v10, s0, v7
 ; GFX8-NEXT:    v_mul_lo_u32 v9, s0, v7
-; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v10
-; GFX8-NEXT:    v_xor_b32_e32 v2, s2, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:    v_mul_lo_u32 v10, v6, v9
 ; GFX8-NEXT:    v_mul_lo_u32 v11, v7, v8
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s2, v3
@@ -1683,14 +1638,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    s_ashr_i32 s2, s9, 31
 ; GFX9-NEXT:    s_ashr_i32 s6, s13, 31
 ; GFX9-NEXT:    s_add_u32 s0, s8, s2
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s9, s2
 ; GFX9-NEXT:    s_add_u32 s8, s12, s6
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    s_mov_b32 s7, s6
 ; GFX9-NEXT:    s_addc_u32 s9, s13, s6
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
@@ -1702,8 +1651,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s8
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1711,27 +1659,24 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -1745,6 +1690,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -1826,14 +1772,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
 ; GFX9-NEXT:    s_ashr_i32 s8, s15, 31
 ; GFX9-NEXT:    s_add_u32 s12, s10, s6
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    s_addc_u32 s13, s11, s6
 ; GFX9-NEXT:    s_add_u32 s10, s14, s8
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    s_addc_u32 s11, s15, s8
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
@@ -1858,14 +1798,11 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_f32_e32 v4, v6, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
-; GFX9-NEXT:    s_and_b32 s14, s14, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX9-NEXT:    s_subb_u32 s14, 0, s11
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v8, s3, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
@@ -2015,321 +1952,303 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
 ; GFX10-NEXT:    s_ashr_i32 s6, s13, 31
 ; GFX10-NEXT:    s_add_u32 s0, s8, s2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_mov_b32 s7, s6
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s9, s2
 ; GFX10-NEXT:    s_add_u32 s8, s12, s6
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_mov_b32 s3, s2
+; GFX10-NEXT:    s_mov_b32 s7, s6
 ; GFX10-NEXT:    s_addc_u32 s9, s13, s6
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    s_mov_b32 s3, s2
 ; GFX10-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX10-NEXT:    s_sub_u32 s20, 0, s8
-; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX10-NEXT:    s_and_b32 s12, s12, 1
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT:    s_subb_u32 s21, 0, s9
 ; GFX10-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX10-NEXT:    s_xor_b64 s[18:19], s[2:3], s[6:7]
 ; GFX10-NEXT:    s_ashr_i32 s16, s15, 31
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX10-NEXT:    s_add_u32 s6, s10, s12
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_mov_b32 s17, s16
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_mov_b32 s13, s12
 ; GFX10-NEXT:    s_addc_u32 s7, s11, s12
 ; GFX10-NEXT:    s_add_u32 s10, s14, s16
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX10-NEXT:    s_mov_b32 s17, s16
 ; GFX10-NEXT:    s_addc_u32 s11, s15, s16
-; GFX10-NEXT:    s_xor_b64 s[14:15], s[6:7], s[12:13]
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_xor_b64 s[10:11], s[10:11], s[16:17]
+; GFX10-NEXT:    s_mov_b32 s13, s12
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s10
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s10
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    s_xor_b64 s[14:15], s[6:7], s[12:13]
 ; GFX10-NEXT:    s_sub_u32 s3, 0, s10
-; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX10-NEXT:    s_subb_u32 s6, 0, s11
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX10-NEXT:    s_and_b32 s6, s6, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0xcf800000, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    s_subb_u32 s6, 0, s11
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v3, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s20, v2
+; GFX10-NEXT:    v_mul_lo_u32 v5, s20, v2
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v4
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GFX10-NEXT:    v_mul_lo_u32 v4, s21, v0
-; GFX10-NEXT:    v_mul_hi_u32 v5, s20, v0
-; GFX10-NEXT:    v_mul_lo_u32 v6, s20, v0
-; GFX10-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v1
-; GFX10-NEXT:    v_add3_u32 v3, v4, v3, v5
-; GFX10-NEXT:    v_trunc_f32_e32 v4, v7
-; GFX10-NEXT:    v_mul_lo_u32 v5, v2, v6
-; GFX10-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v6
-; GFX10-NEXT:    v_mul_lo_u32 v8, v0, v3
-; GFX10-NEXT:    v_mul_lo_u32 v10, v2, v3
-; GFX10-NEXT:    v_mul_f32_e32 v9, 0xcf800000, v4
-; GFX10-NEXT:    v_mul_hi_u32 v11, v0, v3
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX10-NEXT:    v_add_f32_e32 v1, v9, v1
-; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v6, s7, v10, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v7
+; GFX10-NEXT:    v_mul_f32_e32 v4, 0xcf800000, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v6, s21, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s20, v0
+; GFX10-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX10-NEXT:    v_mul_lo_u32 v4, s20, v0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s3, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s7
-; GFX10-NEXT:    v_mul_lo_u32 v9, s3, v4
+; GFX10-NEXT:    v_add3_u32 v5, v6, v5, v7
+; GFX10-NEXT:    v_mul_lo_u32 v6, v2, v4
+; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v1
+; GFX10-NEXT:    v_mul_hi_u32 v9, s3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v12, v0, v5
+; GFX10-NEXT:    v_mul_hi_u32 v11, v0, v4
+; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX10-NEXT:    v_mul_lo_u32 v13, v2, v5
+; GFX10-NEXT:    v_mul_lo_u32 v10, s3, v1
+; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v5
+; GFX10-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v8, v9
+; GFX10-NEXT:    v_add_co_u32 v6, s7, v6, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v4, s7, v13, v4
+; GFX10-NEXT:    v_mul_lo_u32 v8, v3, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s7
+; GFX10-NEXT:    v_mul_lo_u32 v15, v1, v7
 ; GFX10-NEXT:    v_add_co_u32 v6, s7, v6, v11
-; GFX10-NEXT:    v_mul_lo_u32 v12, s6, v1
-; GFX10-NEXT:    v_mul_hi_u32 v13, s3, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v8, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s7
-; GFX10-NEXT:    v_mul_lo_u32 v11, s3, v1
-; GFX10-NEXT:    v_add_co_u32 v5, s7, v6, v5
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v7
+; GFX10-NEXT:    v_mul_hi_u32 v9, v1, v10
+; GFX10-NEXT:    v_mul_hi_u32 v10, v3, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s7
-; GFX10-NEXT:    v_add3_u32 v8, v12, v9, v13
-; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v11
-; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v11
-; GFX10-NEXT:    v_mul_hi_u32 v11, v4, v11
-; GFX10-NEXT:    v_add3_u32 v3, v7, v6, v3
-; GFX10-NEXT:    v_mul_lo_u32 v6, v1, v8
-; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v8
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
-; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v8
-; GFX10-NEXT:    v_mul_lo_u32 v12, s21, v0
-; GFX10-NEXT:    v_add_co_u32 v6, s7, v9, v6
-; GFX10-NEXT:    v_mul_hi_u32 v13, s20, v0
-; GFX10-NEXT:    v_mul_lo_u32 v14, s20, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v7, s7, v7, v11
+; GFX10-NEXT:    v_add_co_u32 v4, s7, v4, v14
+; GFX10-NEXT:    v_mul_lo_u32 v14, v3, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v6, s7, v6, v10
-; GFX10-NEXT:    v_mul_lo_u32 v3, s20, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v12, v6
+; GFX10-NEXT:    v_add_co_u32 v8, s7, v8, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s7
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v13, v11
+; GFX10-NEXT:    v_mul_hi_u32 v16, v1, v7
+; GFX10-NEXT:    v_add_co_u32 v10, s7, v14, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v4, s7, v4, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v5, s7, v7, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s7
-; GFX10-NEXT:    v_add3_u32 v12, v12, v14, v13
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v9, v6
-; GFX10-NEXT:    v_mul_hi_u32 v8, v4, v8
-; GFX10-NEXT:    v_mul_lo_u32 v10, v2, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
-; GFX10-NEXT:    v_mul_lo_u32 v11, v0, v12
-; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v6
-; GFX10-NEXT:    v_mul_hi_u32 v9, v0, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s7
-; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX10-NEXT:    v_mul_lo_u32 v13, v2, v12
-; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v5
-; GFX10-NEXT:    v_add_co_u32 v5, s7, v10, v11
-; GFX10-NEXT:    v_add3_u32 v6, v7, v6, v8
-; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v12
-; GFX10-NEXT:    v_mul_lo_u32 v10, s6, v1
-; GFX10-NEXT:    v_add_co_u32 v5, s6, v5, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s7
-; GFX10-NEXT:    v_add_co_u32 v3, s7, v13, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
-; GFX10-NEXT:    v_mul_hi_u32 v11, s3, v1
-; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v14
-; GFX10-NEXT:    v_mul_lo_u32 v13, s3, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v5
+; GFX10-NEXT:    v_add_co_u32 v8, s7, v8, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s7
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s6
-; GFX10-NEXT:    v_mul_hi_u32 v7, v2, v12
-; GFX10-NEXT:    v_mul_lo_u32 v6, s3, v1
-; GFX10-NEXT:    v_add_co_u32 v3, s3, v3, v5
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, v8, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT:    v_add3_u32 v9, v10, v13, v11
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
-; GFX10-NEXT:    v_mul_lo_u32 v10, v4, v6
-; GFX10-NEXT:    v_add3_u32 v5, v8, v5, v7
-; GFX10-NEXT:    v_mul_lo_u32 v7, v1, v9
-; GFX10-NEXT:    v_mul_hi_u32 v11, v1, v6
-; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v6
-; GFX10-NEXT:    v_mul_lo_u32 v8, v4, v9
+; GFX10-NEXT:    v_add_co_u32 v9, s7, v10, v16
+; GFX10-NEXT:    v_add3_u32 v5, v11, v6, v5
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v12, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
-; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v9
-; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v0
-; GFX10-NEXT:    v_add_co_u32 v7, s3, v10, v7
-; GFX10-NEXT:    v_mul_lo_u32 v13, s0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v7, v3, v7
+; GFX10-NEXT:    v_add_co_u32 v4, s7, v9, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v13, v10
+; GFX10-NEXT:    v_mul_lo_u32 v5, s20, v0
+; GFX10-NEXT:    v_mul_lo_u32 v9, s21, v0
+; GFX10-NEXT:    v_mul_hi_u32 v10, s20, v0
+; GFX10-NEXT:    v_mul_lo_u32 v11, s20, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s7
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
+; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v5
+; GFX10-NEXT:    v_add3_u32 v6, v6, v8, v7
+; GFX10-NEXT:    v_mul_lo_u32 v7, v2, v5
+; GFX10-NEXT:    v_mul_hi_u32 v8, v0, v5
+; GFX10-NEXT:    v_add3_u32 v5, v9, v11, v10
+; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX10-NEXT:    v_mul_hi_u32 v10, s3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v12, v0, v5
+; GFX10-NEXT:    v_mul_lo_u32 v13, v2, v5
+; GFX10-NEXT:    v_mul_lo_u32 v11, s3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v6, s3, v1
+; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v5
+; GFX10-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GFX10-NEXT:    v_add_co_u32 v7, s3, v7, v12
+; GFX10-NEXT:    v_add3_u32 v9, v9, v11, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v6, s3, v8, v6
-; GFX10-NEXT:    v_mul_hi_u32 v12, s0, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v14, s1, v2
+; GFX10-NEXT:    v_add_co_u32 v4, s3, v13, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v7, s3, v7, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v4, s3, v4, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v7, s3, v7, v11
+; GFX10-NEXT:    v_mul_lo_u32 v15, v3, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v7
+; GFX10-NEXT:    v_mul_lo_u32 v12, v1, v9
+; GFX10-NEXT:    v_mul_hi_u32 v16, v1, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, v11, v8
+; GFX10-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GFX10-NEXT:    v_add_co_u32 v4, s3, v4, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v3, s3, v6, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v5, s3, v5, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v0, s3, v14, v0
-; GFX10-NEXT:    v_mul_hi_u32 v15, s0, v2
+; GFX10-NEXT:    v_mul_lo_u32 v13, v3, v9
+; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v9
+; GFX10-NEXT:    v_add_co_u32 v11, s3, v15, v12
+; GFX10-NEXT:    v_add3_u32 v5, v8, v7, v5
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v6, s3, v13, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v5, s3, v5, v12
+; GFX10-NEXT:    v_add_co_u32 v7, s3, v11, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v4, s3, v6, v10
+; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v10, s1, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v11, s1, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v12, v7
+; GFX10-NEXT:    v_mul_hi_u32 v12, s0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v9, v3, v9
+; GFX10-NEXT:    v_add_co_u32 v6, s3, v6, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v10, s3, v11, v10
+; GFX10-NEXT:    v_add_co_u32 v0, s6, v6, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v10, s3, v10, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s3
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v8, v0
+; GFX10-NEXT:    v_add_co_u32 v4, s3, v4, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v8, v6
-; GFX10-NEXT:    v_add_co_u32 v0, s3, v0, v15
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v11, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s3
-; GFX10-NEXT:    v_mul_hi_u32 v9, v4, v9
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_add_co_u32 v0, s3, v0, v5
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, v13, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT:    v_add_co_u32 v3, s3, v3, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s3
-; GFX10-NEXT:    v_add3_u32 v2, v8, v5, v2
-; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
-; GFX10-NEXT:    v_add3_u32 v5, v6, v7, v9
-; GFX10-NEXT:    v_mul_lo_u32 v6, s9, v0
-; GFX10-NEXT:    v_mul_hi_u32 v7, s8, v0
-; GFX10-NEXT:    v_mul_lo_u32 v9, s8, v2
-; GFX10-NEXT:    v_mul_lo_u32 v3, s8, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v5, s15, v1
-; GFX10-NEXT:    v_mul_hi_u32 v10, s15, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, s14, v1
-; GFX10-NEXT:    v_mul_hi_u32 v17, s14, v4
-; GFX10-NEXT:    v_add3_u32 v6, v6, v9, v7
-; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, s0, v3
-; GFX10-NEXT:    v_mul_lo_u32 v7, s14, v4
-; GFX10-NEXT:    v_mul_lo_u32 v9, s15, v4
-; GFX10-NEXT:    v_sub_nc_u32_e32 v11, s1, v6
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v6, s0, s1, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v3
-; GFX10-NEXT:    v_mul_hi_u32 v4, s15, v4
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, v3, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v6
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, v13, v12, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v7
+; GFX10-NEXT:    v_add_co_u32 v0, s3, v10, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v13, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s3
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
+; GFX10-NEXT:    v_add3_u32 v5, v5, v7, v9
+; GFX10-NEXT:    v_mul_lo_u32 v4, s9, v0
+; GFX10-NEXT:    v_add3_u32 v2, v6, v8, v2
+; GFX10-NEXT:    v_mul_lo_u32 v7, s15, v1
+; GFX10-NEXT:    v_mul_lo_u32 v8, s8, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX10-NEXT:    v_mul_hi_u32 v5, s8, v0
+; GFX10-NEXT:    v_mul_lo_u32 v6, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v9, s14, v1
+; GFX10-NEXT:    v_mul_lo_u32 v11, s14, v3
+; GFX10-NEXT:    v_mul_hi_u32 v1, s15, v1
+; GFX10-NEXT:    v_mul_lo_u32 v12, s15, v3
+; GFX10-NEXT:    v_mul_hi_u32 v13, s14, v3
+; GFX10-NEXT:    v_mul_hi_u32 v3, s15, v3
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_add3_u32 v4, v4, v6, v5
+; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s0, v8
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v7, v11
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s1, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v10
-; GFX10-NEXT:    v_add_co_u32 v1, s1, v5, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v17, s0, v0, 1
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v4, s0, s1, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v5
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v4
+; GFX10-NEXT:    v_add_co_u32 v6, s1, v6, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v15, vcc_lo, v5, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v16, s0, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v7, v6
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v14, v11, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s0
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v12, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v13, s0, v0, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s0, 0, v2, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v7, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v15
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v13, s0
-; GFX10-NEXT:    v_add_co_u32 v9, s0, v9, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v16
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, v9, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v17, v14, s0
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v1, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v10, s0, v17, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v13, s0, 0, v18, s0
+; GFX10-NEXT:    v_add_co_u32 v12, s0, v13, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v18, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    v_add3_u32 v4, v5, v1, v4
-; GFX10-NEXT:    v_sub_co_u32 v1, s0, v14, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v5, s0, 0, v11, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v17, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v18, v13, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v13, s11, v9
-; GFX10-NEXT:    v_mul_lo_u32 v16, s10, v4
-; GFX10-NEXT:    v_mul_hi_u32 v17, s10, v9
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_add3_u32 v3, v9, v1, v3
+; GFX10-NEXT:    v_sub_co_u32 v1, s0, v15, s8
+; GFX10-NEXT:    v_mul_hi_u32 v17, s10, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v13, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v18, v14, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v13, s11, v6
+; GFX10-NEXT:    v_mul_lo_u32 v14, s10, v3
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
-; GFX10-NEXT:    v_mul_lo_u32 v7, s10, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v14, v1, s0
-; GFX10-NEXT:    v_add3_u32 v10, v13, v16, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v15, v5, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v7, s10, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v9, v13, v14, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v15, v1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v16, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v7, s0, s14, v7
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v11, s1, s15, v10, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s15, v10
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v11, s1, s15, v9, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s15, v9
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v11
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s18, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s19, v2
-; GFX10-NEXT:    v_xor_b32_e32 v5, s2, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, vcc_lo, s11, v1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, vcc_lo, s11, v1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v7, s10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v10, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v9, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s18
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s19, v2, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v11
-; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v3
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, s11, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v12, s0
+; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v5
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s11, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v12, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v15, s0, v9, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v4, s0
+; GFX10-NEXT:    v_add_co_u32 v15, s0, v6, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v3, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s0
 ; GFX10-NEXT:    v_add_co_u32 v12, s0, v15, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_sub_co_u32 v6, s0, v13, s10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_sub_co_u32 v8, s0, v13, s10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v4, v15, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v11, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v14, v9, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v9, s2, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v11, v8, s0
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[12:13], s[16:17]
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v2, s2
-; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v9
-; GFX10-NEXT:    v_xor_b32_e32 v7, s1, v10
-; GFX10-NEXT:    v_xor_b32_e32 v9, s12, v3
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v5, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v10, s12, v6
+; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v6
+; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v3
+; GFX10-NEXT:    v_xor_b32_e32 v6, s12, v7
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v9, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v7, s12, v8
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, s0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v9, s12
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v10, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, s12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
-; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i64> %x, %y
   store <2 x i64> %div, <2 x i64> addrspace(1)* %out0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 1e95103fd61cb..0cddf3e2c86ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -204,14 +204,8 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_ashr_i32 s6, s3, 31
 ; CHECK-NEXT:    s_ashr_i32 s0, s5, 31
 ; CHECK-NEXT:    s_add_u32 s10, s2, s6
-; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
-; CHECK-NEXT:    s_and_b32 s7, s7, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s7, 0
 ; CHECK-NEXT:    s_addc_u32 s11, s3, s6
 ; CHECK-NEXT:    s_add_u32 s8, s4, s0
-; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
-; CHECK-NEXT:    s_and_b32 s3, s3, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
 ; CHECK-NEXT:    s_mov_b32 s1, s0
 ; CHECK-NEXT:    s_addc_u32 s9, s5, s0
 ; CHECK-NEXT:    s_xor_b64 s[8:9], s[8:9], s[0:1]
@@ -222,21 +216,18 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_sub_u32 s0, 0, s8
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
-; CHECK-NEXT:    s_and_b32 s1, s1, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
+; CHECK-NEXT:    s_subb_u32 s1, 0, s9
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT:    s_subb_u32 s1, 0, s9
-; CHECK-NEXT:    v_mul_lo_u32 v3, s0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, s1, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, s0, v1
+; CHECK-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s0, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_mul_lo_u32 v3, v1, v4
 ; CHECK-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -1174,43 +1165,38 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_movk_i32 s10, 0x1000
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    s_mov_b32 s6, 0
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
 ; GISEL-NEXT:    s_sub_u32 s4, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_subb_u32 s5, 0, s9
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1234,7 +1220,6 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
@@ -1303,16 +1288,13 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v7
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s8, v7
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
@@ -1323,26 +1305,23 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
 ; GISEL-NEXT:    s_sub_u32 s4, 0, s6
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GISEL-NEXT:    s_subb_u32 s5, 0, s7
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -1882,43 +1861,38 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s10, 0x12d8fb
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    s_mov_b32 s6, 0
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    s_mov_b32 s7, s6
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
 ; GISEL-NEXT:    s_sub_u32 s4, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    s_subb_u32 s5, 0, s9
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
@@ -1942,7 +1916,6 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
@@ -2011,16 +1984,13 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v7
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT:    s_add_u32 s4, s10, 0
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_subrev_i32_e32 v9, vcc, s8, v7
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_add_u32 s4, s10, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    s_addc_u32 s5, 0, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
@@ -2031,26 +2001,23 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
 ; GISEL-NEXT:    s_sub_u32 s4, 0, s6
-; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GISEL-NEXT:    s_and_b32 s5, s5, 1
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GISEL-NEXT:    s_subb_u32 s5, 0, s7
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index f3509f3e80694..38599949d7777 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4203,9 +4203,6 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s4, s0, s2
-; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX6-NEXT:    s_and_b32 s5, s5, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
@@ -4229,9 +4226,6 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-LABEL: s_ssubsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s4, s0, s2
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX8-NEXT:    s_and_b32 s5, s5, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -4255,9 +4249,6 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-LABEL: s_ssubsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s4, s0, s2
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX9-NEXT:    s_and_b32 s5, s5, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -4281,15 +4272,12 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-LABEL: s_ssubsat_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s4, s0, s2
-; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[2:3], 0
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_subb_u32 s5, s1, s3
-; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[2:3], 0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], s[0:1]
+; GFX10-NEXT:    s_mov_b32 s3, 0
 ; GFX10-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_xor_b32 s2, s2, s1
 ; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
@@ -4545,9 +4533,6 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-LABEL: s_ssubsat_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s8, s0, s4
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    s_and_b32 s9, s9, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
@@ -4558,16 +4543,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX6-NEXT:    s_addc_u32 s1, s4, s5
-; GFX6-NEXT:    s_sub_u32 s0, s2, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s8
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
+; GFX6-NEXT:    s_addc_u32 s1, s4, s5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX6-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
@@ -4594,9 +4576,6 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-LABEL: s_ssubsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s8, s0, s4
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_and_b32 s9, s9, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -4607,16 +4586,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX8-NEXT:    s_addc_u32 s1, s4, s5
-; GFX8-NEXT:    s_sub_u32 s0, s2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_addc_u32 s1, s4, s5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -4643,9 +4619,6 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-LABEL: s_ssubsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s8, s0, s4
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_and_b32 s9, s9, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -4656,16 +4629,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    s_brev_b32 s5, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX9-NEXT:    s_addc_u32 s1, s4, s5
-; GFX9-NEXT:    s_sub_u32 s0, s2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_addc_u32 s1, s4, s5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -4692,32 +4662,26 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-LABEL: s_ssubsat_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s8, s0, s4
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, s[4:5], 0
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
 ; GFX10-NEXT:    s_mov_b32 s11, 0
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s8
-; GFX10-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX10-NEXT:    s_brev_b32 s10, 1
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
-; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    s_xor_b32 s8, s4, s1
 ; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s8
 ; GFX10-NEXT:    s_addc_u32 s1, s0, s10
 ; GFX10-NEXT:    s_sub_u32 s4, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
-; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_subb_u32 s5, s3, s7
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s3, s[6:7], 0
 ; GFX10-NEXT:    s_ashr_i32 s0, s5, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
 ; GFX10-NEXT:    s_xor_b32 s2, s3, s2
 ; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
@@ -4736,19 +4700,10 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s8, s0, s4
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    s_and_b32 s9, s9, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX6-NEXT:    s_subb_u32 s9, s1, s5
-; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX6-NEXT:    s_and_b32 s10, s10, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX6-NEXT:    s_subb_u32 s10, s2, s6
-; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    s_and_b32 s11, s11, 1
+; GFX6-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_subb_u32 s10, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s11, s3, s7
@@ -4761,21 +4716,15 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[6:7], 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX6-NEXT:    s_mov_b32 s1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_addc_u32 s1, s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[6:7], 0
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s2, s0, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_and_b32 s3, s3, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
@@ -4800,18 +4749,9 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-LABEL: s_ssubsat_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s8, s0, s4
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_and_b32 s9, s9, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    s_subb_u32 s9, s1, s5
-; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX8-NEXT:    s_and_b32 s10, s10, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX8-NEXT:    s_subb_u32 s10, s2, s6
-; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX8-NEXT:    s_and_b32 s11, s11, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_subb_u32 s10, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -4835,17 +4775,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX8-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX8-NEXT:    s_mov_b32 s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_addc_u32 s1, s0, 0
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s2, s0, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
@@ -4870,18 +4804,9 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-LABEL: s_ssubsat_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s8, s0, s4
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_and_b32 s9, s9, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    s_subb_u32 s9, s1, s5
-; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX9-NEXT:    s_and_b32 s10, s10, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX9-NEXT:    s_subb_u32 s10, s2, s6
-; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX9-NEXT:    s_and_b32 s11, s11, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_subb_u32 s10, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -4905,17 +4830,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_addc_u32 s1, s0, 0
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s2, s0, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
@@ -4940,62 +4859,47 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-LABEL: s_ssubsat_i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s8, s0, s4
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_subb_u32 s9, s1, s5
-; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT:    s_and_b32 s10, s10, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    s_subb_u32 s10, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s11, s11, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
 ; GFX10-NEXT:    s_subb_u32 s11, s3, s7
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
 ; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[6:7], 0
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, s11
-; GFX10-NEXT:    s_and_b32 s0, 1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s1, s[4:5], 0
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[4:5], 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    s_and_b32 s0, 1, s12
 ; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[6:7], 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX10-NEXT:    s_and_b32 s1, 1, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
 ; GFX10-NEXT:    s_mov_b32 s1, 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    s_addc_u32 s1, s0, 0
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s9
-; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s11
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0
+; GFX10-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s8
-; GFX10-NEXT:    s_addc_u32 s2, s0, 0
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s10
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_addc_u32 s3, s0, 0x80000000
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, s3, vcc_lo
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
@@ -5553,19 +5457,10 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-LABEL: s_ssubsat_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s16, s0, s8
-; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX6-NEXT:    s_and_b32 s17, s17, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX6-NEXT:    s_subb_u32 s17, s1, s9
-; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX6-NEXT:    s_and_b32 s18, s18, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX6-NEXT:    s_subb_u32 s18, s2, s10
-; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    s_and_b32 s19, s19, 1
+; GFX6-NEXT:    s_subb_u32 s17, s1, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s19, s3, s11
@@ -5578,51 +5473,36 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[10:11], 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX6-NEXT:    s_ashr_i32 s0, s19, 31
 ; GFX6-NEXT:    s_mov_b32 s1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_addc_u32 s1, s0, 0
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_addc_u32 s2, s0, 0
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[10:11], 0
-; GFX6-NEXT:    s_and_b32 s3, s3, 1
 ; GFX6-NEXT:    s_brev_b32 s8, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT:    s_addc_u32 s2, s0, 0
+; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_addc_u32 s3, s0, s8
-; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    s_sub_u32 s0, s4, s12
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_subb_u32 s1, s5, s13
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s16
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s17
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s19
-; GFX6-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX6-NEXT:    s_sub_u32 s0, s4, s12
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    s_and_b32 s3, s3, 1
+; GFX6-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX6-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s3, s7, s15
@@ -5635,21 +5515,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[14:15], 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    s_mov_b32 s5, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_addc_u32 s5, s4, 0
-; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX6-NEXT:    s_and_b32 s6, s6, 1
-; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[14:15], 0
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_addc_u32 s6, s4, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_and_b32 s7, s7, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_addc_u32 s7, s4, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
@@ -5678,18 +5552,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-LABEL: s_ssubsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s16, s0, s8
-; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX8-NEXT:    s_and_b32 s17, s17, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    s_subb_u32 s17, s1, s9
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX8-NEXT:    s_and_b32 s18, s18, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX8-NEXT:    s_subb_u32 s18, s2, s10
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -5713,46 +5578,31 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX8-NEXT:    s_ashr_i32 s0, s19, 31
 ; GFX8-NEXT:    s_mov_b32 s1, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_addc_u32 s1, s0, 0
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX8-NEXT:    s_addc_u32 s2, s0, 0
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    s_brev_b32 s8, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_addc_u32 s2, s0, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_addc_u32 s3, s0, s8
-; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    s_sub_u32 s0, s4, s12
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_subb_u32 s1, s5, s13
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s17
-; GFX8-NEXT:    s_subb_u32 s2, s6, s14
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s19
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_sub_u32 s0, s4, s12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX8-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
@@ -5776,17 +5626,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX8-NEXT:    s_mov_b32 s5, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_addc_u32 s5, s4, 0
-; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX8-NEXT:    s_and_b32 s6, s6, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_addc_u32 s6, s4, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_and_b32 s7, s7, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_addc_u32 s7, s4, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
@@ -5815,18 +5659,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-LABEL: s_ssubsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s16, s0, s8
-; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX9-NEXT:    s_and_b32 s17, s17, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    s_subb_u32 s17, s1, s9
-; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX9-NEXT:    s_and_b32 s18, s18, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX9-NEXT:    s_subb_u32 s18, s2, s10
-; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX9-NEXT:    s_and_b32 s19, s19, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -5850,46 +5685,31 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX9-NEXT:    s_ashr_i32 s0, s19, 31
 ; GFX9-NEXT:    s_mov_b32 s1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_addc_u32 s1, s0, 0
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_addc_u32 s2, s0, 0
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    s_brev_b32 s8, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_addc_u32 s2, s0, 0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_addc_u32 s3, s0, s8
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    s_sub_u32 s0, s4, s12
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_subb_u32 s1, s5, s13
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s17
-; GFX9-NEXT:    s_subb_u32 s2, s6, s14
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s19
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_sub_u32 s0, s4, s12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
@@ -5913,17 +5733,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_mov_b32 s5, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_addc_u32 s5, s4, 0
-; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX9-NEXT:    s_and_b32 s6, s6, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_addc_u32 s6, s4, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    s_and_b32 s7, s7, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_addc_u32 s7, s4, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
@@ -5952,120 +5766,90 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-LABEL: s_ssubsat_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s16, s0, s8
-; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX10-NEXT:    s_and_b32 s17, s17, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_subb_u32 s17, s1, s9
-; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
-; GFX10-NEXT:    s_and_b32 s18, s18, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_subb_u32 s18, s2, s10
-; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s19, s19, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
 ; GFX10-NEXT:    s_subb_u32 s19, s3, s11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
+; GFX10-NEXT:    s_brev_b32 s21, 1
 ; GFX10-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s20
 ; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[10:11], 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_mov_b32 s10, 0
+; GFX10-NEXT:    s_mov_b32 s20, 0
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    s_ashr_i32 s0, s19, 31
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX10-NEXT:    s_brev_b32 s11, 1
 ; GFX10-NEXT:    s_addc_u32 s1, s0, 0
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT:    s_addc_u32 s2, s0, 0
+; GFX10-NEXT:    s_addc_u32 s3, s0, s21
+; GFX10-NEXT:    s_sub_u32 s8, s4, s12
+; GFX10-NEXT:    s_subb_u32 s9, s5, s13
+; GFX10-NEXT:    s_subb_u32 s10, s6, s14
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[8:9], s[4:5]
+; GFX10-NEXT:    s_subb_u32 s11, s7, s15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], s[6:7]
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s17
-; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, s19
+; GFX10-NEXT:    v_mov_b32_e32 v7, s11
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[10:11], s[6:7]
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s6, s[12:13], 0
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT:    s_addc_u32 s2, s0, 0
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s16
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_addc_u32 s3, s0, s11
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT:    s_sub_u32 s0, s4, s12
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_subb_u32 s1, s5, s13
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s4, s[12:13], 0
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT:    s_subb_u32 s8, s6, s14
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, s[14:15], 0
-; GFX10-NEXT:    s_subb_u32 s9, s7, s15
-; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[6:7]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s4
-; GFX10-NEXT:    s_and_b32 s2, 1, s2
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10-NEXT:    s_and_b32 s4, 1, s16
 ; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
-; GFX10-NEXT:    s_and_b32 s3, 1, s3
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
-; GFX10-NEXT:    s_addc_u32 s3, s2, 0
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v8, s9
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    v_mov_b32_e32 v6, s1
-; GFX10-NEXT:    s_addc_u32 s4, s2, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, s[14:15], 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
-; GFX10-NEXT:    v_mov_b32_e32 v5, s0
-; GFX10-NEXT:    v_mov_b32_e32 v7, s8
-; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    s_addc_u32 s1, s2, s11
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v8, s1, vcc_lo
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
-; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
-; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_and_b32 s5, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
+; GFX10-NEXT:    v_mov_b32_e32 v5, s19
+; GFX10-NEXT:    v_mov_b32_e32 v6, s9
+; GFX10-NEXT:    v_xor_b32_e32 v3, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX10-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s1, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s3, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v5, s8
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, s10
+; GFX10-NEXT:    s_addc_u32 s2, s0, 0
+; GFX10-NEXT:    s_addc_u32 s3, s0, s21
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, s1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, s3, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX10-NEXT:    v_readfirstlane_b32 s6, v3
 ; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index ea9547e063025..ec59cb495898d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -457,7 +457,6 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_sub_u32 s0, s0, s1
 ; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX7-NEXT:    s_and_b32 s1, s1, 1
 ; GFX7-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -465,7 +464,6 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -473,7 +471,6 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
   %usubo = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
@@ -487,13 +484,10 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
 define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-LABEL: s_usubo_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_and_b32 s5, s5, 1
 ; GFX7-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX7-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX7-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX7-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s5
@@ -505,13 +499,10 @@ define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
 ;
 ; GFX8-LABEL: s_usubo_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_and_b32 s5, s5, 1
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX8-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX8-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
@@ -523,13 +514,10 @@ define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
 ;
 ; GFX9-LABEL: s_usubo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    s_and_b32 s5, s5, 1
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX9-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX9-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
@@ -553,8 +541,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX7-NEXT:    s_sub_u32 s1, s1, s3
 ; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX7-NEXT:    s_and_b32 s2, s2, 1
-; GFX7-NEXT:    s_and_b32 s3, s3, 1
 ; GFX7-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX7-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -565,8 +551,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_sub_u32 s1, s1, s3
 ; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -577,8 +561,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_sub_u32 s1, s1, s3
 ; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX9-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -728,9 +710,6 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-LABEL: s_ssubo_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_sub_u32 s4, s0, s2
-; GFX7-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX7-NEXT:    s_and_b32 s5, s5, 1
-; GFX7-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
@@ -748,9 +727,6 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX8-LABEL: s_ssubo_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s4, s0, s2
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX8-NEXT:    s_and_b32 s5, s5, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -768,9 +744,6 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX9-LABEL: s_ssubo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s4, s0, s2
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX9-NEXT:    s_and_b32 s5, s5, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index b07bdeeabd7ce..24284351fc911 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2591,9 +2591,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: s_uaddsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s2
-; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX6-NEXT:    s_and_b32 s4, s4, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
@@ -2609,9 +2606,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-LABEL: s_uaddsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
-; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX8-NEXT:    s_and_b32 s4, s4, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -2627,9 +2621,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-LABEL: s_uaddsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
-; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX9-NEXT:    s_and_b32 s4, s4, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -2645,9 +2636,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-LABEL: s_uaddsat_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s0, s0, s2
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s2, s[0:1], s[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, -1, s2
@@ -2816,20 +2804,14 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-LABEL: s_uaddsat_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s4
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX6-NEXT:    s_and_b32 s8, s8, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    s_add_u32 s0, s2, s6
-; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX6-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -2848,20 +2830,14 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-LABEL: s_uaddsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s4
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX8-NEXT:    s_and_b32 s8, s8, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    s_add_u32 s0, s2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -2880,20 +2856,14 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-LABEL: s_uaddsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_and_b32 s8, s8, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -2912,23 +2882,17 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-LABEL: s_uaddsat_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s0, s0, s4
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX10-NEXT:    s_add_u32 s2, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s3, s3, s7
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, -1, s4
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, -1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, -1, s4
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, s2, -1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, s3, -1, s5
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -2940,19 +2904,10 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: s_uaddsat_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s4
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX6-NEXT:    s_and_b32 s8, s8, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX6-NEXT:    s_addc_u32 s1, s1, s5
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX6-NEXT:    s_and_b32 s8, s8, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX6-NEXT:    s_addc_u32 s2, s2, s6
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX6-NEXT:    s_addc_u32 s2, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s7
@@ -2981,18 +2936,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-LABEL: s_uaddsat_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s4
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX8-NEXT:    s_and_b32 s8, s8, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s5
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX8-NEXT:    s_and_b32 s8, s8, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX8-NEXT:    s_addc_u32 s2, s2, s6
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX8-NEXT:    s_and_b32 s8, s8, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX8-NEXT:    s_addc_u32 s2, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_addc_u32 s3, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
@@ -3025,18 +2971,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-LABEL: s_uaddsat_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_and_b32 s8, s8, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_and_b32 s8, s8, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX9-NEXT:    s_addc_u32 s2, s2, s6
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_and_b32 s8, s8, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX9-NEXT:    s_addc_u32 s2, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
@@ -3069,26 +3006,17 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-LABEL: s_uaddsat_i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s0, s0, s4
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s1, s5
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s2, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
 ; GFX10-NEXT:    s_addc_u32 s3, s3, s7
 ; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    s_and_b32 s4, 1, s4
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[2:3], s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT:    s_and_b32 s4, 1, s8
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -3450,19 +3378,10 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-LABEL: s_uaddsat_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s8
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX6-NEXT:    s_and_b32 s16, s16, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_addc_u32 s1, s1, s9
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX6-NEXT:    s_and_b32 s16, s16, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_addc_u32 s2, s2, s10
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
-; GFX6-NEXT:    s_and_b32 s16, s16, 1
+; GFX6-NEXT:    s_addc_u32 s1, s1, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_addc_u32 s2, s2, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s11
@@ -3472,30 +3391,21 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    s_add_u32 s0, s4, s12
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s1, s5, s13
+; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_addc_u32 s2, s6, s14
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX6-NEXT:    s_add_u32 s0, s4, s12
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s12
-; GFX6-NEXT:    s_and_b32 s3, s3, 1
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v0, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, v1, -1, vcc
-; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX6-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s3, s7, s15
@@ -3528,18 +3438,9 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-LABEL: s_uaddsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s8
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_and_b32 s16, s16, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s9
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_and_b32 s16, s16, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX8-NEXT:    s_addc_u32 s2, s2, s10
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_and_b32 s16, s16, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_addc_u32 s2, s2, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    s_addc_u32 s3, s3, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s10
@@ -3552,28 +3453,19 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_and_b32 s8, 1, s10
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s8
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    s_add_u32 s0, s4, s12
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s1, s5, s13
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX8-NEXT:    s_addc_u32 s2, s6, s14
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_add_u32 s0, s4, s12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s12
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v0, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v1, -1, vcc
@@ -3612,18 +3504,9 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-LABEL: s_uaddsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s8
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_and_b32 s16, s16, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s9
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_and_b32 s16, s16, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX9-NEXT:    s_addc_u32 s2, s2, s10
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_and_b32 s16, s16, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_addc_u32 s2, s2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s11
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s10
@@ -3636,28 +3519,19 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_and_b32 s8, 1, s10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    s_add_u32 s0, s4, s12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s1, s5, s13
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_addc_u32 s2, s6, s14
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_add_u32 s0, s4, s12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s12
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v1, -1, vcc
@@ -3696,69 +3570,51 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-LABEL: s_uaddsat_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s0, s0, s8
-; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX10-NEXT:    s_and_b32 s16, s16, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s1, s9
-; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
-; GFX10-NEXT:    s_and_b32 s16, s16, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_addc_u32 s2, s2, s10
-; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s8
-; GFX10-NEXT:    s_and_b32 s16, s16, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
 ; GFX10-NEXT:    s_addc_u32 s3, s3, s11
 ; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s10, s[2:3], s[10:11]
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s8
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s8, s[2:3], s[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s8
 ; GFX10-NEXT:    s_and_b32 s8, 1, s16
 ; GFX10-NEXT:    s_add_u32 s4, s4, s12
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s8
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s10
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_addc_u32 s5, s5, s13
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s8
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s9, s[4:5], s[12:13]
 ; GFX10-NEXT:    s_addc_u32 s6, s6, s14
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s9
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s7, s7, s15
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], s[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s9
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s9, s[6:7], s[14:15]
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_and_b32 s8, 1, s8
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s0, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, s1, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, s2, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, s3, -1, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, s4, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s5, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, s6, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, s7, -1, vcc_lo
-; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, -1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, s2, -1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, s3, -1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, -1, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, s4, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, s5, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, s6, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, s7, -1, s0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 2ba189ce7b965..3a7625ea9e362 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -194,14 +194,11 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s3
 ; CHECK-NEXT:    s_sub_u32 s4, 0, s2
-; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v2
-; CHECK-NEXT:    s_and_b32 s5, s5, 1
+; CHECK-NEXT:    s_subb_u32 s5, 0, s3
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
-; CHECK-NEXT:    s_subb_u32 s5, 0, s3
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 106ca06a30191..12423fc70269d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -117,13 +117,10 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s11
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s10
 ; GFX8-NEXT:    s_sub_u32 s0, 0, s10
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
@@ -140,19 +137,19 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v0, v4
-; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
@@ -269,13 +266,10 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s11
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s10
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s10
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -296,16 +290,16 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
+; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
@@ -412,11 +406,8 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s11
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s10
 ; GFX10-NEXT:    s_sub_u32 s0, 0, s10
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_subb_u32 s1, 0, s11
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1026,13 +1017,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s12
 ; GFX8-NEXT:    s_sub_u32 s0, 0, s12
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_subb_u32 s1, 0, s13
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_subb_u32 s1, 0, s13
+; GFX8-NEXT:    s_sub_u32 s2, 0, s14
+; GFX8-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1040,7 +1030,6 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_sub_u32 s2, 0, s14
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
@@ -1050,19 +1039,19 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v0, v4
-; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
@@ -1171,23 +1160,19 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_trunc_f32_e32 v6, v6
 ; GFX8-NEXT:    v_mul_f32_e32 v7, 0xcf800000, v6
 ; GFX8-NEXT:    v_add_f32_e32 v3, v7, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX8-NEXT:    s_and_b32 s0, s0, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s3, 0, s15
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s3, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v8, s2, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX8-NEXT:    v_mul_hi_u32 v10, s2, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v9, s2, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
 ; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v10
 ; GFX8-NEXT:    v_mul_lo_u32 v8, v6, v9
 ; GFX8-NEXT:    v_mul_lo_u32 v10, v3, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v3, v9
 ; GFX8-NEXT:    v_mul_hi_u32 v9, v6, v9
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
@@ -1318,13 +1303,13 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s13
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s12
 ; GFX9-NEXT:    s_sub_u32 s0, 0, s12
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_subb_u32 s1, 0, s13
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_subb_u32 s1, 0, s13
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, s15
+; GFX9-NEXT:    s_sub_u32 s2, 0, s14
+; GFX9-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1332,14 +1317,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, s15
-; GFX9-NEXT:    s_sub_u32 s2, 0, s14
+; GFX9-NEXT:    v_mul_f32_e32 v14, 0x4f800000, v14
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v14, 0x4f800000, v14
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -1349,16 +1332,16 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
+; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
@@ -1455,20 +1438,16 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_f32_e32 v5, v13, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX9-NEXT:    s_and_b32 s0, s0, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_subb_u32 s3, 0, s15
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v13, s3, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v14, s2, v12
 ; GFX9-NEXT:    v_mul_hi_u32 v16, s2, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v17, s2, v5
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX9-NEXT:    v_add3_u32 v4, v13, v14, v16
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v12, v17
 ; GFX9-NEXT:    v_mul_lo_u32 v13, v5, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v17
 ; GFX9-NEXT:    v_mul_hi_u32 v14, v12, v17
 ; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v13
@@ -1600,19 +1579,13 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    s_sub_u32 s0, 0, s12
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    s_subb_u32 s1, 0, s13
+; GFX10-NEXT:    s_sub_u32 s2, 0, s14
+; GFX10-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_add_f32_e32 v1, v2, v3
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_subb_u32 s1, 0, s13
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT:    s_sub_u32 s2, 0, s14
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
@@ -1690,174 +1663,174 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s2, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v4, s0, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v8, s2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX10-NEXT:    v_add3_u32 v5, v6, v5, v7
 ; GFX10-NEXT:    v_add3_u32 v9, v9, v11, v10
-; GFX10-NEXT:    v_mul_lo_u32 v12, v2, v4
+; GFX10-NEXT:    v_mul_lo_u32 v13, v2, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX10-NEXT:    v_mul_hi_u32 v13, v0, v4
+; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v11, v2, v5
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v3, v8
-; GFX10-NEXT:    v_mul_lo_u32 v15, v1, v9
+; GFX10-NEXT:    v_mul_lo_u32 v16, v1, v9
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v1, v8
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v8
-; GFX10-NEXT:    v_add_co_u32 v10, s0, v12, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v17, v3, v9
+; GFX10-NEXT:    v_add_co_u32 v10, s0, v13, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v11, v4
-; GFX10-NEXT:    v_mul_lo_u32 v16, v3, v9
+; GFX10-NEXT:    v_mul_hi_u32 v15, v0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v15
-; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v10, s0, v10, v13
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v8, s0, v17, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v10, s0, v10, v14
+; GFX10-NEXT:    v_mul_hi_u32 v18, v1, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v8, s0, v16, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v14
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v12, v10
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v7
-; GFX10-NEXT:    v_mul_hi_u32 v5, v2, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v10
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v14
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v13, v10
+; GFX10-NEXT:    v_add_co_u32 v8, s0, v8, v18
+; GFX10-NEXT:    v_mul_hi_u32 v5, v2, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT:    v_mul_hi_u32 v17, v1, v9
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v15, v6
-; GFX10-NEXT:    v_add3_u32 v5, v7, v10, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v16, v6
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v11, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
 ; GFX10-NEXT:    v_mul_hi_u32 v9, v3, v9
-; GFX10-NEXT:    v_mul_hi_u32 v10, s9, v0
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0
-; GFX10-NEXT:    v_add_co_u32 v8, s0, v8, v17
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v8, v6
-; GFX10-NEXT:    v_mul_lo_u32 v6, s9, v0
-; GFX10-NEXT:    v_mul_lo_u32 v8, s8, v2
-; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v13, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s9, v2
-; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v8
-; GFX10-NEXT:    v_add3_u32 v5, v7, v5, v9
-; GFX10-NEXT:    v_mul_hi_u32 v7, s8, v2
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v8, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, v17, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s1, v6, v0
-; GFX10-NEXT:    v_add_co_u32 v9, s0, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_add3_u32 v5, v11, v7, v5
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT:    v_add3_u32 v4, v10, v8, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v6
+; GFX10-NEXT:    v_mul_lo_u32 v5, s9, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v4, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v7, s8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v9, s9, v2
+; GFX10-NEXT:    v_mul_hi_u32 v10, s8, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s9, v2
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v9, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v8, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v7, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
 ; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v1
-; GFX10-NEXT:    v_mul_lo_u32 v8, s10, v3
-; GFX10-NEXT:    v_mul_lo_u32 v9, s13, v0
-; GFX10-NEXT:    v_mul_hi_u32 v10, s12, v0
-; GFX10-NEXT:    v_add3_u32 v2, v4, v5, v2
-; GFX10-NEXT:    v_mul_hi_u32 v7, s10, v1
+; GFX10-NEXT:    v_mul_hi_u32 v8, s10, v1
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v5, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v9, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v10, s10, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v5, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s11, v1
-; GFX10-NEXT:    v_mul_lo_u32 v4, s11, v3
+; GFX10-NEXT:    v_mul_lo_u32 v5, s11, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v9, v7
+; GFX10-NEXT:    v_mul_hi_u32 v11, s10, v3
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v5, v1
+; GFX10-NEXT:    v_add3_u32 v2, v7, v4, v2
+; GFX10-NEXT:    v_mul_lo_u32 v5, s13, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s12, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX10-NEXT:    v_mul_lo_u32 v13, s12, v0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s12, v2
+; GFX10-NEXT:    v_mul_lo_u32 v10, s12, v2
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_mul_hi_u32 v5, s10, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
-; GFX10-NEXT:    v_add_co_u32 v1, s0, v4, v1
-; GFX10-NEXT:    v_add3_u32 v9, v9, v11, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, s8, v13
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v7
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s9, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s0, s9, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s13, v9
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v8, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, v10, s12
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v7, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v9
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v13, v11, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s13, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s0
+; GFX10-NEXT:    v_add3_u32 v5, v5, v10, v7
+; GFX10-NEXT:    v_sub_co_u32 v7, vcc_lo, s8, v13
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v9, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s9, v5
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s0, s9, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v7
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s13, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s13, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v11, vcc_lo, v7, s12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v5
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s13, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v10, v9, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s13, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v1, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v0, 1
+; GFX10-NEXT:    v_add_co_u32 v15, s0, v0, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v2, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v15
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v13
 ; GFX10-NEXT:    v_add3_u32 v3, v4, v1, v3
 ; GFX10-NEXT:    v_mul_hi_u32 v18, s14, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v13, v8, s0
-; GFX10-NEXT:    v_mul_lo_u32 v13, s15, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v14, v10, s0
+; GFX10-NEXT:    v_mul_lo_u32 v14, s15, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v17, s14, v3
-; GFX10-NEXT:    v_add_co_u32 v1, s0, v5, 1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v15, 1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v10
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, 0, v16, s0
-; GFX10-NEXT:    v_sub_co_u32 v19, s0, v14, s12
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v5, s14, v6
+; GFX10-NEXT:    v_sub_co_u32 v19, s0, v11, s12
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v15, v1, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v15, s14, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v13, v13, v17, v18
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX10-NEXT:    v_add3_u32 v14, v14, v17, v18
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s11, v13
-; GFX10-NEXT:    v_sub_co_u32 v11, s0, s10, v5
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s11, v13, s0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s11, v14
+; GFX10-NEXT:    v_sub_co_u32 v9, s0, s10, v15
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v15, s1, s11, v14, s0
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v11
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v8
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s15, v16
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s0
-; GFX10-NEXT:    v_sub_co_u32 v13, s0, v11, s14
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v19, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s2, 0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s15, v16
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v10
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v9
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s15, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v11, v19, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
+; GFX10-NEXT:    v_sub_co_u32 v14, s0, v9, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v16, s2, 0, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v13, v8, s1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s15, v15
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s15, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s14, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s15, v16
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s1
-; GFX10-NEXT:    v_add_co_u32 v15, s1, v6, 1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s14, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s1
+; GFX10-NEXT:    v_add_co_u32 v13, s1, v6, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s1, 0, v3, s1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s15, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s1
-; GFX10-NEXT:    v_add_co_u32 v10, s1, v15, 1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s15, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s1
+; GFX10-NEXT:    v_add_co_u32 v11, s1, v13, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s1, 0, v17, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v8
-; GFX10-NEXT:    v_sub_co_u32 v8, s1, v13, s14
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v10
+; GFX10-NEXT:    v_sub_co_u32 v10, s1, v14, s14
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v15, v10, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, v17, v18, s0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v13, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, v14, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v10, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v11, v8, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v13, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v13, v11, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v17, v18, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v14, v10, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v16, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v11, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v7, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v10, s1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v12, v[0:3], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx4 v12, v[4:7], s[6:7]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index ae9aeb99b258d..27de0ccd4b23a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -191,14 +191,11 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s3
 ; CHECK-NEXT:    s_sub_u32 s4, 0, s2
-; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v2
-; CHECK-NEXT:    s_and_b32 s5, s5, 1
+; CHECK-NEXT:    s_subb_u32 s5, 0, s3
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
-; CHECK-NEXT:    s_subb_u32 s5, 0, s3
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
@@ -1103,226 +1100,220 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s8, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s8
 ; GISEL-NEXT:    s_sub_u32 s6, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    v_madmk_f32 v6, v4, 0x4f800000, v5
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v4
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v4
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GISEL-NEXT:    v_madmk_f32 v5, v4, 0x4f800000, v6
 ; GISEL-NEXT:    s_subb_u32 s7, 0, 0
 ; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
 ; GISEL-NEXT:    s_bfe_i32 s5, -1, 0x10000
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v5
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v6
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v5
 ; GISEL-NEXT:    v_mov_b32_e32 v5, s4
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s5
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v7
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GISEL-NEXT:    s_sub_u32 s9, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    s_subb_u32 s10, 0, 0
+; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s11, -1, 0x10000
+; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v6
+; GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
 ; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v8
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v9
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s10, 0, 0
-; GISEL-NEXT:    v_mul_lo_u32 v11, s9, v9
-; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s11, -1, 0x10000
-; GISEL-NEXT:    v_mul_lo_u32 v12, s6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, s7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v15, s9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, s10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v17, s9, v7
-; GISEL-NEXT:    v_mov_b32_e32 v18, s4
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v12
-; GISEL-NEXT:    v_mul_hi_u32 v19, v6, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v15
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v15
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v17
-; GISEL-NEXT:    v_mul_lo_u32 v17, v7, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v9
+; GISEL-NEXT:    v_mul_lo_u32 v13, s6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, s7, v7
+; GISEL-NEXT:    v_mul_hi_u32 v15, s6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v16, s9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v17, s10, v6
+; GISEL-NEXT:    v_mul_hi_u32 v18, s9, v6
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
+; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v13
+; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v16
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v16
+; GISEL-NEXT:    v_mul_hi_u32 v16, v9, v16
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
+; GISEL-NEXT:    v_mul_lo_u32 v18, v6, v12
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v11
+; GISEL-NEXT:    v_mul_lo_u32 v17, v8, v11
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v12
+; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v17, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v17, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; GISEL-NEXT:    v_mul_hi_u32 v18, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v11, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v13, s7, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v16
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v15, s10, v6
+; GISEL-NEXT:    v_mul_hi_u32 v16, s9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v17, s6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v11
+; GISEL-NEXT:    v_mul_hi_u32 v19, v7, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; GISEL-NEXT:    v_mul_lo_u32 v17, s9, v9
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT:    v_mul_lo_u32 v16, v6, v15
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v10
-; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v10
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v19
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v13
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v16, v12
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v15
+; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v16, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
+; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v15
+; GISEL-NEXT:    v_mul_hi_u32 v17, v6, v15
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v16
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, s6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, s7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v13, s6, v6
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, s9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, s10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v15, s9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, s6, v8
-; GISEL-NEXT:    v_mul_lo_u32 v17, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v19, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v8, v10
+; GISEL-NEXT:    v_mov_b32_e32 v19, s11
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v15
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, s9, v9
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v14
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v12
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v12
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v17, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v19
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v17, s[4:5], v17, v19
-; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v14
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v15, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT:    v_mul_hi_u32 v16, v7, v14
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v19, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
-; GISEL-NEXT:    v_mov_b32_e32 v19, s11
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT:    v_mul_hi_u32 v14, v9, v14
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v15
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v16
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v13, v2, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, v0, v8
-; GISEL-NEXT:    v_mul_lo_u32 v15, v1, v8
-; GISEL-NEXT:    v_mul_hi_u32 v16, v0, v8
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v14, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v8
+; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v1, v8
-; GISEL-NEXT:    v_mul_lo_u32 v17, v2, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v9
-; GISEL-NEXT:    v_mul_hi_u32 v13, v2, v9
+; GISEL-NEXT:    v_mul_lo_u32 v18, v2, v9
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v9
+; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v15, v6
+; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v16
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v16, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v12, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v16
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, s8, v6
-; GISEL-NEXT:    v_mul_lo_u32 v14, 0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, s8, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v13, s8, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, s8, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_mul_lo_u32 v14, s8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, s8, v6
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v1, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v6
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v16, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v1, v7, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v7
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v13
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], v3, v7, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v14
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[6:7], v3, v6
 ; GISEL-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[6:7]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[6:7]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v18, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
 ; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 805afaad6d3e7..3a1566b63e501 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2460,11 +2460,8 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sub_u32 s4, s0, s2
-; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX6-NEXT:    s_and_b32 s5, s5, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX6-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
@@ -2478,11 +2475,8 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ;
 ; GFX8-LABEL: s_usubsat_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sub_u32 s4, s0, s2
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX8-NEXT:    s_and_b32 s5, s5, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX8-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
@@ -2496,11 +2490,8 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ;
 ; GFX9-LABEL: s_usubsat_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_sub_u32 s4, s0, s2
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX9-NEXT:    s_and_b32 s5, s5, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX9-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
@@ -2515,10 +2506,7 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-LABEL: s_usubsat_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s4, s0, s2
-; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], s[2:3]
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_subb_u32 s1, s1, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, s4, 0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s0
@@ -2685,21 +2673,15 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sub_u32 s8, s0, s4
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    s_and_b32 s9, s9, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    s_subb_u32 s9, s1, s5
+; GFX6-NEXT:    s_sub_u32 s8, s0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT:    s_sub_u32 s0, s2, s6
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
@@ -2717,21 +2699,15 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ;
 ; GFX8-LABEL: s_usubsat_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sub_u32 s8, s0, s4
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_and_b32 s9, s9, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    s_subb_u32 s9, s1, s5
+; GFX8-NEXT:    s_sub_u32 s8, s0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT:    s_sub_u32 s0, s2, s6
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
@@ -2749,21 +2725,15 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ;
 ; GFX9-LABEL: s_usubsat_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_sub_u32 s8, s0, s4
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_and_b32 s9, s9, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    s_subb_u32 s9, s1, s5
+; GFX9-NEXT:    s_sub_u32 s8, s0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    s_sub_u32 s0, s2, s6
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
@@ -2782,23 +2752,17 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-LABEL: s_usubsat_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s8, s0, s4
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX10-NEXT:    s_subb_u32 s9, s1, s5
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s1, s[0:1], s[4:5]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
+; GFX10-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX10-NEXT:    s_sub_u32 s0, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s2, s[2:3], s[6:7]
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, s8, 0, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s9, 0, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s4
 ; GFX10-NEXT:    s_subb_u32 s1, s3, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, s8, 0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, s0, 0, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, s1, 0, s2
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -2809,28 +2773,19 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sub_u32 s8, s0, s4
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    s_and_b32 s9, s9, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s7
-; GFX6-NEXT:    s_and_b32 s10, s10, 1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX6-NEXT:    s_subb_u32 s10, s2, s6
+; GFX6-NEXT:    s_sub_u32 s8, s0, s4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX6-NEXT:    s_and_b32 s11, s11, 1
+; GFX6-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_subb_u32 s10, s2, s6
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s8
@@ -2851,18 +2806,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-LABEL: s_usubsat_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s8, s0, s4
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_and_b32 s9, s9, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    s_subb_u32 s9, s1, s5
-; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX8-NEXT:    s_and_b32 s10, s10, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX8-NEXT:    s_subb_u32 s10, s2, s6
-; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX8-NEXT:    s_and_b32 s11, s11, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_subb_u32 s10, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
@@ -2895,18 +2841,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-LABEL: s_usubsat_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s8, s0, s4
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_and_b32 s9, s9, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    s_subb_u32 s9, s1, s5
-; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX9-NEXT:    s_and_b32 s10, s10, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX9-NEXT:    s_subb_u32 s10, s2, s6
-; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX9-NEXT:    s_and_b32 s11, s11, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_subb_u32 s10, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
@@ -2939,33 +2876,24 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-LABEL: s_usubsat_i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s8, s0, s4
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], s[4:5]
-; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_subb_u32 s9, s1, s5
-; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s10, s10, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    s_subb_u32 s10, s2, s6
-; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX10-NEXT:    s_and_b32 s11, s11, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10-NEXT:    s_subb_u32 s1, s3, s7
+; GFX10-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s2, s[2:3], s[6:7]
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[2:3], s[6:7]
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    s_and_b32 s0, 1, s12
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, s8, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, s9, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, s10, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, s1, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, s11, 0, vcc_lo
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
@@ -3319,61 +3247,43 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_v2i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sub_u32 s16, s0, s8
-; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX6-NEXT:    s_and_b32 s17, s17, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX6-NEXT:    s_subb_u32 s17, s1, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
-; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
-; GFX6-NEXT:    s_and_b32 s18, s18, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s11
-; GFX6-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX6-NEXT:    s_and_b32 s19, s19, 1
+; GFX6-NEXT:    s_sub_u32 s16, s0, s8
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    s_subb_u32 s19, s3, s11
+; GFX6-NEXT:    s_subb_u32 s17, s1, s9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_sub_u32 s0, s4, s12
+; GFX6-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s17
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX6-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v2, 0, vcc
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s19
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v0, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, v1, 0, vcc
-; GFX6-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s15
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_subb_u32 s2, s6, s14
+; GFX6-NEXT:    s_sub_u32 s0, s4, s12
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
-; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX6-NEXT:    s_and_b32 s3, s3, 1
+; GFX6-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX6-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
@@ -3398,18 +3308,9 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-LABEL: s_usubsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s16, s0, s8
-; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX8-NEXT:    s_and_b32 s17, s17, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    s_subb_u32 s17, s1, s9
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX8-NEXT:    s_and_b32 s18, s18, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX8-NEXT:    s_subb_u32 s18, s2, s10
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s10
@@ -3422,28 +3323,19 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_and_b32 s0, 1, s10
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_sub_u32 s0, s4, s12
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_subb_u32 s1, s5, s13
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s17
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_sub_u32 s0, s4, s12
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v2, 0, vcc
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s19
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX8-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v0, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v1, 0, vcc
@@ -3482,18 +3374,9 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-LABEL: s_usubsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s16, s0, s8
-; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX9-NEXT:    s_and_b32 s17, s17, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    s_subb_u32 s17, s1, s9
-; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX9-NEXT:    s_and_b32 s18, s18, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX9-NEXT:    s_subb_u32 s18, s2, s10
-; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX9-NEXT:    s_and_b32 s19, s19, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s10
@@ -3506,28 +3389,19 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_and_b32 s0, 1, s10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_sub_u32 s0, s4, s12
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_subb_u32 s1, s5, s13
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s17
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_sub_u32 s0, s4, s12
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s16
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v2, 0, vcc
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v1, 0, vcc
@@ -3566,69 +3440,51 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-LABEL: s_usubsat_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s16, s0, s8
-; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
-; GFX10-NEXT:    s_and_b32 s17, s17, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_subb_u32 s17, s1, s9
-; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s18, s18, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_subb_u32 s18, s2, s10
-; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX10-NEXT:    s_and_b32 s19, s19, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX10-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s2, s[2:3], s[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[2:3], s[10:11]
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s20
-; GFX10-NEXT:    s_sub_u32 s8, s4, s12
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_sub_u32 s2, s4, s12
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[4:5], s[12:13]
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_subb_u32 s3, s5, s13
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s1, s[4:5], s[12:13]
-; GFX10-NEXT:    s_subb_u32 s10, s6, s14
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_and_b32 s0, s0, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s1, s[6:7], s[14:15]
-; GFX10-NEXT:    s_subb_u32 s9, s7, s15
+; GFX10-NEXT:    s_subb_u32 s1, s5, s13
+; GFX10-NEXT:    s_subb_u32 s8, s6, s14
+; GFX10-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], s[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[6:7], s[14:15]
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s16, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, s17, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, s18, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, s19, 0, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, s8, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s3, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, s10, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, s9, 0, vcc_lo
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX10-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX10-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, s16, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, s18, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, s19, 0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, s17, 0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, s2, 0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, s1, 0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, s8, 0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, s3, 0, s0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index a03df7c7cb7b7..89b3900dc2880 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -190,9 +190,6 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
 ; GCN-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
 ; GCN-NEXT:    s_not_b64 s[4:5], s[2:3]
 ; GCN-NEXT:    s_add_u32 s2, s2, s0
-; GCN-NEXT:    s_cselect_b32 s0, 1, 0
-; GCN-NEXT:    s_and_b32 s0, s0, 1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_addc_u32 s3, s3, s1
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
@@ -203,11 +200,8 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
 ; GFX10-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
 ; GFX10-NEXT:    s_not_b64 s[4:5], s[2:3]
 ; GFX10-NEXT:    s_add_u32 s2, s2, s0
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, s0, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    s_addc_u32 s3, s3, s1
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    ; return to shader part epilog
   %xor = xor i64 %a, %b

diff  --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index ab8648f198853..44e8b5704b04e 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1616,9 +1616,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-GISEL-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
-; GFX8-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1635,9 +1632,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-GISEL-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
-; GFX10-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1710,9 +1704,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
-; GFX8-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1729,9 +1720,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
-; GFX10-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1804,9 +1792,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
-; GFX8-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1823,9 +1808,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
-; GFX10-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1902,9 +1884,6 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
 ; GFX8-GISEL-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
 ; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
-; GFX8-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1922,9 +1901,6 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
 ; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
 ; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
-; GFX10-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1

diff  --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index ea9b5628b02fd..bd0c2b30eb5de 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -283,14 +283,8 @@ define amdgpu_ps i64 @s_csh_64_0(i64 inreg %a, i64 inreg %b) {
 ; GISEL-NEXT:    s_lshr_b64 s[6:7], s[0:1], s2
 ; GISEL-NEXT:    s_ashr_i64 s[0:1], s[0:1], s2
 ; GISEL-NEXT:    s_add_u32 s2, s4, s6
-; GISEL-NEXT:    s_cselect_b32 s3, 1, 0
-; GISEL-NEXT:    s_and_b32 s3, s3, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GISEL-NEXT:    s_addc_u32 s3, s5, s7
 ; GISEL-NEXT:    s_add_u32 s0, s2, s0
-; GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GISEL-NEXT:    s_addc_u32 s1, s3, s1
 ; GISEL-NEXT:    ; return to shader part epilog
   %and = and i64 %b, 63
@@ -322,14 +316,8 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) {
 ; GISEL-NEXT:    s_lshr_b64 s[6:7], s[0:1], s2
 ; GISEL-NEXT:    s_ashr_i64 s[0:1], s[0:1], s2
 ; GISEL-NEXT:    s_add_u32 s2, s4, s6
-; GISEL-NEXT:    s_cselect_b32 s3, 1, 0
-; GISEL-NEXT:    s_and_b32 s3, s3, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GISEL-NEXT:    s_addc_u32 s3, s5, s7
 ; GISEL-NEXT:    s_add_u32 s0, s2, s0
-; GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GISEL-NEXT:    s_and_b32 s2, s2, 1
-; GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GISEL-NEXT:    s_addc_u32 s1, s3, s1
 ; GISEL-NEXT:    ; return to shader part epilog
   %and = and i64 %b, 255

diff  --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
index 7a2fc0ff1d51d..7eaeb32e461ba 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
@@ -1972,3 +1972,24 @@ TEST_F(AMDGPUGISelMITest, TestKnownBitsAssertAlign) {
   CheckBits(30, Copies.size() - 2);
   CheckBits(5, Copies.size() - 1);
 }
+
+TEST_F(AArch64GISelMITest, TestKnownBitsUADDO) {
+  StringRef MIRString = R"(
+   %ptr:_(p0) = G_IMPLICIT_DEF
+   %ld0:_(s32) = G_LOAD %ptr(p0) :: (load (s16))
+   %ld1:_(s32) = G_LOAD %ptr(p0) :: (load (s16))
+
+   %add:_(s32), %overflow:_(s32) = G_UADDO %ld0, %ld1
+   %copy_overflow:_(s32) = COPY %overflow
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    return;
+
+  Register CopyOverflow = Copies[Copies.size() - 1];
+  GISelKnownBits Info(*MF);
+  KnownBits Res = Info.getKnownBits(CopyOverflow);
+  EXPECT_EQ(0u, Res.One.getZExtValue());
+  EXPECT_EQ(31u, Res.Zero.countLeadingOnes());
+}

diff  --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
index dc915d5f5e216..bddeb1342f0d9 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
@@ -1527,3 +1527,24 @@ TEST_F(AArch64GISelMITest, TestKnownBitsVectorAssertZext) {
   EXPECT_EQ(0u, Res.One.getZExtValue());
   EXPECT_EQ(0xFFFFFFFFFFFFFFF8u, Res.Zero.getZExtValue());
 }
+
+TEST_F(AArch64GISelMITest, TestNumSignBitsUAddoOverflow) {
+  StringRef MIRString = R"(
+   %copy_x0:_(s64) = COPY $x0
+   %copy_x1:_(s64) = COPY $x1
+   %x0_x1:_(<2 x s64>) = G_BUILD_VECTOR %copy_x0, %copy_x1
+   %uaddo:_(<2 x s64>), %overflow:_(<2 x s32>) = G_UADDO %x0_x1, %x0_x1
+   %result:_(<2 x s32>) = COPY %overflow
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    return;
+
+  Register CopyOverflow = Copies[Copies.size() - 1];
+
+  GISelKnownBits Info(*MF);
+
+  // Assert sign-extension from vector boolean
+  EXPECT_EQ(32u, Info.computeNumSignBits(CopyOverflow));
+}


        


More information about the llvm-commits mailing list