[llvm] 1416744 - GlobalISel: Implement computeKnownBits for overflow bool results
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 11 16:44:46 PDT 2022
Author: Matt Arsenault
Date: 2022-04-11T19:43:37-04:00
New Revision: 1416744f8405db03096bc240a8ec9de176a71569
URL: https://github.com/llvm/llvm-project/commit/1416744f8405db03096bc240a8ec9de176a71569
DIFF: https://github.com/llvm/llvm-project/commit/1416744f8405db03096bc240a8ec9de176a71569.diff
LOG: GlobalISel: Implement computeKnownBits for overflow bool results
Added:
Modified:
llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
llvm/test/CodeGen/AMDGPU/bfi_int.ll
llvm/test/CodeGen/AMDGPU/constrained-shift.ll
llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 64c2f0d5f8e49..4f03af0fce82d 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -567,6 +567,26 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
Known = KnownBits::ashr(KnownBits::shl(Known, ShiftKnown), ShiftKnown);
break;
}
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_UADDE:
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_SADDE:
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_USUBE:
+ case TargetOpcode::G_SSUBO:
+ case TargetOpcode::G_SSUBE:
+ case TargetOpcode::G_UMULO:
+ case TargetOpcode::G_SMULO: {
+ if (MI.getOperand(1).getReg() == R) {
+ // If we know the boolean overflow result has the top bits zero, use
+ // this info.
+ if (TL.getBooleanContents(DstTy.isVector(), false) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ }
+ break;
+ }
}
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
@@ -673,6 +693,27 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
MI.getOperand(3).getReg(), DemandedElts,
Depth + 1);
}
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_SADDE:
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_UADDE:
+ case TargetOpcode::G_SSUBO:
+ case TargetOpcode::G_SSUBE:
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_USUBE:
+ case TargetOpcode::G_SMULO:
+ case TargetOpcode::G_UMULO: {
+ // If the boolean result is 0/-1, all bits are sign bits.
+ // We know that we have an integer-based boolean since these operations
+ // are only available for integers.
+ if (MI.getOperand(1).getReg() == R) {
+ if (TL.getBooleanContents(DstTy.isVector(), false) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ return TyBits;
+ }
+
+ break;
+ }
case TargetOpcode::G_INTRINSIC:
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
default: {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index 352b811b7845b..646705337aabc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -457,7 +457,6 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_add_u32 s0, s0, s1
; GFX7-NEXT: s_cselect_b32 s1, 1, 0
-; GFX7-NEXT: s_and_b32 s1, s1, 1
; GFX7-NEXT: s_add_i32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
@@ -465,7 +464,6 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s1
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
@@ -473,7 +471,6 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s1
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
%uaddo = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
@@ -488,9 +485,6 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_uaddo_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_cselect_b32 s4, 1, 0
-; GFX7-NEXT: s_and_b32 s4, s4, 1
-; GFX7-NEXT: s_cmp_lg_u32 s4, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: s_addc_u32 s1, s1, s3
; GFX7-NEXT: v_mov_b32_e32 v1, s3
@@ -506,9 +500,6 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX8-LABEL: s_uaddo_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_cselect_b32 s4, 1, 0
-; GFX8-NEXT: s_and_b32 s4, s4, 1
-; GFX8-NEXT: s_cmp_lg_u32 s4, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -524,9 +515,6 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX9-LABEL: s_uaddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s2
-; GFX9-NEXT: s_cselect_b32 s4, 1, 0
-; GFX9-NEXT: s_and_b32 s4, s4, 1
-; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_addc_u32 s1, s1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s3
@@ -553,8 +541,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX7-NEXT: s_cselect_b32 s2, 1, 0
; GFX7-NEXT: s_add_u32 s1, s1, s3
; GFX7-NEXT: s_cselect_b32 s3, 1, 0
-; GFX7-NEXT: s_and_b32 s2, s2, 1
-; GFX7-NEXT: s_and_b32 s3, s3, 1
; GFX7-NEXT: s_add_i32 s0, s0, s2
; GFX7-NEXT: s_add_i32 s1, s1, s3
; GFX7-NEXT: ; return to shader part epilog
@@ -565,8 +551,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
; GFX8-NEXT: s_add_u32 s1, s1, s3
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s3, s3, 1
; GFX8-NEXT: s_add_i32 s0, s0, s2
; GFX8-NEXT: s_add_i32 s1, s1, s3
; GFX8-NEXT: ; return to shader part epilog
@@ -577,8 +561,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-NEXT: s_add_u32 s1, s1, s3
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
-; GFX9-NEXT: s_and_b32 s3, s3, 1
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: ; return to shader part epilog
@@ -728,9 +710,6 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_saddo_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_add_u32 s4, s0, s2
-; GFX7-NEXT: s_cselect_b32 s5, 1, 0
-; GFX7-NEXT: s_and_b32 s5, s5, 1
-; GFX7-NEXT: s_cmp_lg_u32 s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_addc_u32 s5, s1, s3
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -748,9 +727,6 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX8-LABEL: s_saddo_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s4, s0, s2
-; GFX8-NEXT: s_cselect_b32 s5, 1, 0
-; GFX8-NEXT: s_and_b32 s5, s5, 1
-; GFX8-NEXT: s_cmp_lg_u32 s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_addc_u32 s5, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -768,9 +744,6 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s4, s0, s2
-; GFX9-NEXT: s_cselect_b32 s5, 1, 0
-; GFX9-NEXT: s_and_b32 s5, s5, 1
-; GFX9-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s5, s1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index cca8a9ee86fde..f07d2b83dbf31 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -31,9 +31,6 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
; GFX: ; %bb.0:
; GFX-NEXT: s_ashr_i32 s2, s1, 31
; GFX-NEXT: s_add_u32 s0, s0, s2
-; GFX-NEXT: s_cselect_b32 s4, 1, 0
-; GFX-NEXT: s_and_b32 s4, s4, 1
-; GFX-NEXT: s_cmp_lg_u32 s4, 0
; GFX-NEXT: s_mov_b32 s3, s2
; GFX-NEXT: s_addc_u32 s1, s1, s2
; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 97858b3dae67c..ccf6e6be39be3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -447,7 +447,6 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-NEXT: s_mul_i32 s5, s0, s5
; GFX7-NEXT: s_add_i32 s0, s2, s7
; GFX7-NEXT: s_add_i32 s0, s0, s5
-; GFX7-NEXT: s_and_b32 s8, s8, 1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s8, v1
@@ -477,7 +476,6 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX8-NEXT: s_mul_i32 s5, s0, s5
; GFX8-NEXT: s_add_i32 s0, s2, s7
; GFX8-NEXT: s_add_i32 s0, s0, s5
-; GFX8-NEXT: s_and_b32 s8, s8, 1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s8, v1
@@ -492,13 +490,11 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s7, s1, s3
; GFX9-NEXT: s_mul_i32 s8, s0, s4
+; GFX9-NEXT: s_mul_hi_u32 s9, s0, s3
; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: s_mul_hi_u32 s9, s0, s3
-; GFX9-NEXT: s_and_b32 s8, s8, 1
; GFX9-NEXT: s_add_u32 s7, s7, s9
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: s_and_b32 s9, s9, 1
; GFX9-NEXT: s_add_i32 s8, s8, s9
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: s_mul_i32 s9, s1, s4
@@ -521,17 +517,15 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX10-NEXT: s_mul_i32 s7, s0, s4
; GFX10-NEXT: s_mul_hi_u32 s8, s0, s3
; GFX10-NEXT: s_add_u32 s6, s6, s7
-; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_mul_i32 s2, s2, s3
-; GFX10-NEXT: s_and_b32 s7, s7, 1
; GFX10-NEXT: s_mul_i32 s9, s1, s4
+; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_add_u32 s6, s6, s8
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_mul_i32 s5, s0, s5
; GFX10-NEXT: s_add_i32 s2, s2, s9
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s3
; GFX10-NEXT: s_add_i32 s2, s2, s5
-; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s4
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_add_i32 s7, s7, s8
@@ -656,24 +650,21 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX7-NEXT: s_mul_i32 s9, s1, s4
; GFX7-NEXT: s_mul_i32 s10, s0, s5
; GFX7-NEXT: s_add_u32 s9, s9, s10
-; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s9, v0
-; GFX7-NEXT: s_and_b32 s10, s10, 1
+; GFX7-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s10, v1
; GFX7-NEXT: s_mul_i32 s9, s2, s4
; GFX7-NEXT: s_mul_i32 s10, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_add_u32 s9, s9, s10
-; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4
-; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: s_mul_i32 s11, s0, s6
-; GFX7-NEXT: s_and_b32 s10, s10, 1
; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: s_add_u32 s9, s9, s11
+; GFX7-NEXT: s_add_u32 s9, s9, s10
; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3
+; GFX7-NEXT: s_cselect_b32 s10, 1, 0
+; GFX7-NEXT: s_add_u32 s9, s9, s11
; GFX7-NEXT: s_cselect_b32 s11, 1, 0
-; GFX7-NEXT: s_and_b32 s11, s11, 1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s9, v2
; GFX7-NEXT: s_add_i32 s10, s10, s11
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -714,24 +705,21 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX8-NEXT: s_mul_i32 s9, s1, s4
; GFX8-NEXT: s_mul_i32 s10, s0, s5
; GFX8-NEXT: s_add_u32 s9, s9, s10
-; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s9, v0
-; GFX8-NEXT: s_and_b32 s10, s10, 1
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v1
; GFX8-NEXT: s_mul_i32 s9, s2, s4
; GFX8-NEXT: s_mul_i32 s10, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_add_u32 s9, s9, s10
-; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4
-; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: s_mul_i32 s11, s0, s6
-; GFX8-NEXT: s_and_b32 s10, s10, 1
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: s_add_u32 s9, s9, s11
+; GFX8-NEXT: s_add_u32 s9, s9, s10
; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3
+; GFX8-NEXT: s_cselect_b32 s10, 1, 0
+; GFX8-NEXT: s_add_u32 s9, s9, s11
; GFX8-NEXT: s_cselect_b32 s11, 1, 0
-; GFX8-NEXT: s_and_b32 s11, s11, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v2
; GFX8-NEXT: s_add_i32 s10, s10, s11
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -769,37 +757,30 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s9, s1, s4
; GFX9-NEXT: s_mul_i32 s10, s0, s5
+; GFX9-NEXT: s_mul_hi_u32 s11, s0, s4
; GFX9-NEXT: s_add_u32 s9, s9, s10
; GFX9-NEXT: s_cselect_b32 s10, 1, 0
-; GFX9-NEXT: s_mul_hi_u32 s11, s0, s4
-; GFX9-NEXT: s_and_b32 s10, s10, 1
; GFX9-NEXT: s_add_u32 s9, s9, s11
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
-; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s10, s10, s11
; GFX9-NEXT: s_mul_i32 s11, s2, s4
; GFX9-NEXT: s_mul_i32 s12, s1, s5
+; GFX9-NEXT: s_mul_i32 s13, s0, s6
; GFX9-NEXT: s_add_u32 s11, s11, s12
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
-; GFX9-NEXT: s_mul_i32 s13, s0, s6
-; GFX9-NEXT: s_and_b32 s12, s12, 1
; GFX9-NEXT: s_add_u32 s11, s11, s13
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s15, s0, s5
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s15
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
-; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s12, s12, s11
; GFX9-NEXT: s_mul_i32 s3, s3, s4
; GFX9-NEXT: s_mul_i32 s11, s2, s5
@@ -828,52 +809,45 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX10-NEXT: s_mul_hi_u32 s10, s0, s4
; GFX10-NEXT: s_add_u32 s8, s8, s9
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: s_mul_i32 s11, s1, s5
-; GFX10-NEXT: s_and_b32 s9, s9, 1
; GFX10-NEXT: s_add_u32 s8, s8, s10
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
-; GFX10-NEXT: s_mul_i32 s12, s0, s6
-; GFX10-NEXT: s_and_b32 s10, s10, 1
-; GFX10-NEXT: s_mul_hi_u32 s13, s1, s4
+; GFX10-NEXT: s_mul_i32 s11, s1, s5
; GFX10-NEXT: s_add_i32 s9, s9, s10
; GFX10-NEXT: s_mul_i32 s10, s2, s4
-; GFX10-NEXT: s_mul_i32 s3, s3, s4
+; GFX10-NEXT: s_mul_i32 s12, s0, s6
; GFX10-NEXT: s_add_u32 s10, s10, s11
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10-NEXT: s_mul_i32 s7, s0, s7
-; GFX10-NEXT: s_and_b32 s11, s11, 1
; GFX10-NEXT: s_add_u32 s10, s10, s12
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: s_and_b32 s12, s12, 1
+; GFX10-NEXT: s_mul_hi_u32 s13, s1, s4
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_add_u32 s10, s10, s13
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s13, s0, s5
-; GFX10-NEXT: s_and_b32 s12, s12, 1
+; GFX10-NEXT: s_mul_hi_u32 s14, s0, s5
; GFX10-NEXT: s_add_i32 s11, s11, s12
-; GFX10-NEXT: s_add_u32 s10, s10, s13
+; GFX10-NEXT: s_add_u32 s10, s10, s14
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: s_mul_i32 s13, s1, s6
-; GFX10-NEXT: s_and_b32 s12, s12, 1
-; GFX10-NEXT: s_mul_hi_u32 s1, s1, s5
+; GFX10-NEXT: s_mul_i32 s3, s3, s4
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_mul_i32 s12, s2, s5
; GFX10-NEXT: s_add_u32 s9, s10, s9
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
+; GFX10-NEXT: s_mul_i32 s13, s1, s6
; GFX10-NEXT: s_add_i32 s3, s3, s12
-; GFX10-NEXT: s_mul_hi_u32 s2, s2, s4
+; GFX10-NEXT: s_mul_i32 s7, s0, s7
; GFX10-NEXT: s_add_i32 s3, s3, s13
-; GFX10-NEXT: s_and_b32 s10, s10, 1
+; GFX10-NEXT: s_mul_hi_u32 s2, s2, s4
; GFX10-NEXT: s_add_i32 s3, s3, s7
-; GFX10-NEXT: s_add_i32 s11, s11, s10
+; GFX10-NEXT: s_mul_hi_u32 s1, s1, s5
; GFX10-NEXT: s_add_i32 s2, s3, s2
; GFX10-NEXT: s_mul_hi_u32 s3, s0, s6
; GFX10-NEXT: s_add_i32 s1, s2, s1
-; GFX10-NEXT: s_mul_i32 s0, s0, s4
+; GFX10-NEXT: s_add_i32 s11, s11, s10
; GFX10-NEXT: s_add_i32 s1, s1, s3
-; GFX10-NEXT: s_mov_b32 s2, s9
+; GFX10-NEXT: s_mul_i32 s0, s0, s4
; GFX10-NEXT: s_add_i32 s3, s1, s11
; GFX10-NEXT: s_mov_b32 s1, s8
+; GFX10-NEXT: s_mov_b32 s2, s9
; GFX10-NEXT: ; return to shader part epilog
%result = mul i128 %num, %den
%cast = bitcast i128 %result to <4 x i32>
@@ -1082,189 +1056,168 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-NEXT: s_mul_i32 s17, s1, s8
; GFX7-NEXT: s_mul_i32 s18, s16, s9
; GFX7-NEXT: s_add_u32 s17, s17, s18
-; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s17, v0
-; GFX7-NEXT: s_and_b32 s18, s18, 1
+; GFX7-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s18, v1
; GFX7-NEXT: s_mul_i32 s17, s2, s8
; GFX7-NEXT: s_mul_i32 s18, s1, s9
-; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: s_add_u32 s17, s17, s18
-; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8
-; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: s_mul_i32 s19, s16, s10
-; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_mov_b32_e32 v3, s9
-; GFX7-NEXT: s_add_u32 s17, s17, s19
+; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_mul_hi_u32 v4, s16, v3
+; GFX7-NEXT: s_cselect_b32 s18, 1, 0
+; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
-; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s18, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX7-NEXT: s_mul_i32 s17, s3, s8
-; GFX7-NEXT: s_mul_i32 s18, s2, s9
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; GFX7-NEXT: s_mul_i32 s19, s1, s10
-; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX7-NEXT: s_add_u32 s17, s17, s19
+; GFX7-NEXT: s_mul_i32 s17, s3, s8
+; GFX7-NEXT: s_mul_i32 s18, s2, s9
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GFX7-NEXT: s_mul_i32 s19, s1, s10
; GFX7-NEXT: v_mov_b32_e32 v4, s2
-; GFX7-NEXT: s_cselect_b32 s19, 1, 0
+; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_mul_hi_u32 v5, v4, s8
-; GFX7-NEXT: s_and_b32 s19, s19, 1
+; GFX7-NEXT: s_cselect_b32 s18, 1, 0
+; GFX7-NEXT: s_add_u32 s17, s17, s19
+; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: s_mul_i32 s20, s16, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
-; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3
+; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
-; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s17, v5
; GFX7-NEXT: v_mov_b32_e32 v6, s10
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_mul_hi_u32 v7, s16, v6
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8
-; GFX7-NEXT: s_mul_i32 s17, s4, s8
-; GFX7-NEXT: s_mul_i32 s18, s3, s9
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; GFX7-NEXT: s_mul_i32 s19, s2, s10
-; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX7-NEXT: s_cselect_b32 s19, 1, 0
+; GFX7-NEXT: s_mul_i32 s17, s4, s8
+; GFX7-NEXT: s_mul_i32 s18, s3, s9
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; GFX7-NEXT: s_and_b32 s19, s19, 1
+; GFX7-NEXT: s_mul_i32 s19, s2, s10
+; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX7-NEXT: s_mul_i32 s20, s1, s11
-; GFX7-NEXT: s_add_i32 s18, s18, s19
+; GFX7-NEXT: s_cselect_b32 s18, 1, 0
+; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX7-NEXT: s_add_u32 s17, s17, s20
+; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GFX7-NEXT: s_mul_i32 s20, s1, s11
; GFX7-NEXT: v_mov_b32_e32 v5, s3
-; GFX7-NEXT: s_cselect_b32 s19, 1, 0
+; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_mul_hi_u32 v7, v5, s8
-; GFX7-NEXT: s_and_b32 s19, s19, 1
+; GFX7-NEXT: s_add_u32 s17, s17, s20
+; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: s_mul_i32 s21, s16, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
+; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
-; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9
-; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v7, vcc, s17, v7
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11
-; GFX7-NEXT: s_mul_i32 s17, s5, s8
-; GFX7-NEXT: s_mul_i32 s18, s4, s9
; GFX7-NEXT: v_mul_hi_u32 v8, s1, v6
-; GFX7-NEXT: s_add_u32 s17, s17, s18
+; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_mov_b32_e32 v9, s11
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX7-NEXT: s_mul_i32 s19, s3, s10
-; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_mul_hi_u32 v10, s16, v9
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v11, v7
-; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX7-NEXT: s_and_b32 s19, s19, 1
+; GFX7-NEXT: s_mul_i32 s17, s5, s8
+; GFX7-NEXT: s_mul_i32 s18, s4, s9
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GFX7-NEXT: s_mul_i32 s20, s2, s11
-; GFX7-NEXT: s_add_i32 s18, s18, s19
+; GFX7-NEXT: s_mul_i32 s19, s3, s10
+; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GFX7-NEXT: s_add_u32 s17, s17, s20
+; GFX7-NEXT: s_cselect_b32 s18, 1, 0
+; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GFX7-NEXT: s_and_b32 s19, s19, 1
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GFX7-NEXT: s_mul_i32 s21, s1, s12
+; GFX7-NEXT: s_mul_i32 s20, s2, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX7-NEXT: s_add_u32 s17, s17, s21
+; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GFX7-NEXT: s_mul_i32 s21, s1, s12
; GFX7-NEXT: v_mov_b32_e32 v7, s4
-; GFX7-NEXT: s_cselect_b32 s19, 1, 0
+; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_mul_hi_u32 v8, v7, s8
-; GFX7-NEXT: s_and_b32 s19, s19, 1
+; GFX7-NEXT: s_add_u32 s17, s17, s21
+; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: s_mul_i32 s22, s16, s13
; GFX7-NEXT: s_add_i32 s18, s18, s19
+; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX7-NEXT: s_add_u32 s17, s17, s22
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
-; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s17, v8
-; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GFX7-NEXT: v_add_i32_e32 v14, vcc, s18, v14
-; GFX7-NEXT: s_mul_i32 s17, s6, s8
-; GFX7-NEXT: s_mul_i32 s18, s5, s9
-; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_mul_hi_u32 v6, s2, v6
-; GFX7-NEXT: s_cselect_b32 s18, 1, 0
+; GFX7-NEXT: v_add_i32_e32 v14, vcc, s18, v14
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GFX7-NEXT: s_mul_i32 s19, s4, s10
-; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_mul_hi_u32 v11, s1, v9
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mov_b32_e32 v12, s12
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX7-NEXT: s_mul_i32 s20, s3, s11
-; GFX7-NEXT: s_add_i32 s18, s18, s19
+; GFX7-NEXT: s_mul_i32 s17, s6, s8
+; GFX7-NEXT: s_mul_i32 s18, s5, s9
; GFX7-NEXT: v_mul_hi_u32 v13, s16, v12
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; GFX7-NEXT: s_add_u32 s17, s17, s20
+; GFX7-NEXT: s_mul_i32 s19, s4, s10
+; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; GFX7-NEXT: s_cselect_b32 s19, 1, 0
+; GFX7-NEXT: s_cselect_b32 s18, 1, 0
+; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX7-NEXT: s_and_b32 s19, s19, 1
+; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GFX7-NEXT: s_mul_i32 s21, s2, s12
+; GFX7-NEXT: s_mul_i32 s20, s3, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; GFX7-NEXT: s_add_u32 s17, s17, s21
+; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GFX7-NEXT: s_and_b32 s19, s19, 1
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; GFX7-NEXT: s_mul_i32 s22, s1, s13
+; GFX7-NEXT: s_mul_i32 s21, s2, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX7-NEXT: s_add_u32 s17, s17, s22
+; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; GFX7-NEXT: s_mul_i32 s22, s1, s13
; GFX7-NEXT: v_mov_b32_e32 v8, s5
-; GFX7-NEXT: s_cselect_b32 s19, 1, 0
+; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_mul_hi_u32 v10, v8, s8
-; GFX7-NEXT: s_and_b32 s19, s19, 1
+; GFX7-NEXT: s_add_u32 s17, s17, s22
+; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: s_mul_i32 s23, s16, s14
; GFX7-NEXT: s_add_i32 s18, s18, s19
-; GFX7-NEXT: s_add_u32 s17, s17, s23
; GFX7-NEXT: v_mul_hi_u32 v11, v7, s9
+; GFX7-NEXT: s_add_u32 s17, s17, s23
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
-; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v10, vcc, s17, v10
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
@@ -1342,189 +1295,168 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX8-NEXT: s_mul_i32 s17, s1, s8
; GFX8-NEXT: s_mul_i32 s18, s16, s9
; GFX8-NEXT: s_add_u32 s17, s17, s18
-; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s17, v0
-; GFX8-NEXT: s_and_b32 s18, s18, 1
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s18, v1
; GFX8-NEXT: s_mul_i32 s17, s2, s8
; GFX8-NEXT: s_mul_i32 s18, s1, s9
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_add_u32 s17, s17, s18
-; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8
-; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: s_mul_i32 s19, s16, s10
-; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: s_add_u32 s17, s17, s19
+; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_mul_hi_u32 v4, s16, v3
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s18, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
-; GFX8-NEXT: s_mul_i32 s17, s3, s8
-; GFX8-NEXT: s_mul_i32 s18, s2, s9
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
-; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
-; GFX8-NEXT: s_mul_i32 s19, s1, s10
-; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: s_add_u32 s17, s17, s19
+; GFX8-NEXT: s_mul_i32 s17, s3, s8
+; GFX8-NEXT: s_mul_i32 s18, s2, s9
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
+; GFX8-NEXT: s_mul_i32 s19, s1, s10
; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
+; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_mul_hi_u32 v5, v4, s8
-; GFX8-NEXT: s_and_b32 s19, s19, 1
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: s_add_u32 s17, s17, s19
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_mul_i32 s20, s16, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
-; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3
+; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s17, v5
; GFX8-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_mul_hi_u32 v7, s16, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8
-; GFX8-NEXT: s_mul_i32 s17, s4, s8
-; GFX8-NEXT: s_mul_i32 s18, s3, s9
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v5
-; GFX8-NEXT: s_mul_i32 s19, s2, s10
-; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
+; GFX8-NEXT: s_mul_i32 s17, s4, s8
+; GFX8-NEXT: s_mul_i32 s18, s3, s9
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7
-; GFX8-NEXT: s_and_b32 s19, s19, 1
+; GFX8-NEXT: s_mul_i32 s19, s2, s10
+; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: s_mul_i32 s20, s1, s11
-; GFX8-NEXT: s_add_i32 s18, s18, s19
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT: s_add_u32 s17, s17, s20
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT: s_mul_i32 s20, s1, s11
; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
+; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_mul_hi_u32 v7, v5, s8
-; GFX8-NEXT: s_and_b32 s19, s19, 1
+; GFX8-NEXT: s_add_u32 s17, s17, s20
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_mul_i32 s21, s16, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
+; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9
-; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s17, v7
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11
-; GFX8-NEXT: s_mul_i32 s17, s5, s8
-; GFX8-NEXT: s_mul_i32 s18, s4, s9
; GFX8-NEXT: v_mul_hi_u32 v8, s1, v6
-; GFX8-NEXT: s_add_u32 s17, s17, s18
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: s_mul_i32 s19, s3, s10
-; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_mul_hi_u32 v10, s16, v9
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7
-; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: s_and_b32 s19, s19, 1
+; GFX8-NEXT: s_mul_i32 s17, s5, s8
+; GFX8-NEXT: s_mul_i32 s18, s4, s9
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT: s_mul_i32 s20, s2, s11
-; GFX8-NEXT: s_add_i32 s18, s18, s19
+; GFX8-NEXT: s_mul_i32 s19, s3, s10
+; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10
-; GFX8-NEXT: s_add_u32 s17, s17, s20
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT: s_and_b32 s19, s19, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: s_mul_i32 s21, s1, s12
+; GFX8-NEXT: s_mul_i32 s20, s2, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT: s_add_u32 s17, s17, s21
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT: s_mul_i32 s21, s1, s12
; GFX8-NEXT: v_mov_b32_e32 v7, s4
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
+; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_mul_hi_u32 v8, v7, s8
-; GFX8-NEXT: s_and_b32 s19, s19, 1
+; GFX8-NEXT: s_add_u32 s17, s17, s21
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_mul_i32 s22, s16, s13
; GFX8-NEXT: s_add_i32 s18, s18, s19
+; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX8-NEXT: s_add_u32 s17, s17, s22
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s17, v8
-; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, s18, v14
-; GFX8-NEXT: s_mul_i32 s17, s6, s8
-; GFX8-NEXT: s_mul_i32 s18, s5, s9
-; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_mul_hi_u32 v6, s2, v6
-; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: v_add_u32_e32 v14, vcc, s18, v14
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
-; GFX8-NEXT: s_mul_i32 s19, s4, s10
-; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_mul_hi_u32 v11, s1, v9
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mov_b32_e32 v12, s12
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: s_mul_i32 s20, s3, s11
-; GFX8-NEXT: s_add_i32 s18, s18, s19
+; GFX8-NEXT: s_mul_i32 s17, s6, s8
+; GFX8-NEXT: s_mul_i32 s18, s5, s9
; GFX8-NEXT: v_mul_hi_u32 v13, s16, v12
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8
-; GFX8-NEXT: s_add_u32 s17, s17, s20
+; GFX8-NEXT: s_mul_i32 s19, s4, s10
+; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX8-NEXT: s_and_b32 s19, s19, 1
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
-; GFX8-NEXT: s_mul_i32 s21, s2, s12
+; GFX8-NEXT: s_mul_i32 s20, s3, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13
-; GFX8-NEXT: s_add_u32 s17, s17, s21
+; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
-; GFX8-NEXT: s_and_b32 s19, s19, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
-; GFX8-NEXT: s_mul_i32 s22, s1, s13
+; GFX8-NEXT: s_mul_i32 s21, s2, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: s_add_u32 s17, s17, s22
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
+; GFX8-NEXT: s_mul_i32 s22, s1, s13
; GFX8-NEXT: v_mov_b32_e32 v8, s5
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
+; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_mul_hi_u32 v10, v8, s8
-; GFX8-NEXT: s_and_b32 s19, s19, 1
+; GFX8-NEXT: s_add_u32 s17, s17, s22
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_mul_i32 s23, s16, s14
; GFX8-NEXT: s_add_i32 s18, s18, s19
-; GFX8-NEXT: s_add_u32 s17, s17, s23
; GFX8-NEXT: v_mul_hi_u32 v11, v7, s9
+; GFX8-NEXT: s_add_u32 s17, s17, s23
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s17, v10
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
@@ -1599,233 +1531,186 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX9-NEXT: s_mov_b32 s16, s0
; GFX9-NEXT: s_mul_i32 s17, s1, s8
; GFX9-NEXT: s_mul_i32 s18, s16, s9
+; GFX9-NEXT: s_mul_hi_u32 s19, s16, s8
; GFX9-NEXT: s_add_u32 s17, s17, s18
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
-; GFX9-NEXT: s_mul_hi_u32 s19, s16, s8
-; GFX9-NEXT: s_and_b32 s18, s18, 1
; GFX9-NEXT: s_add_u32 s17, s17, s19
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: s_add_i32 s18, s18, s19
; GFX9-NEXT: s_mul_i32 s19, s2, s8
; GFX9-NEXT: s_mul_i32 s20, s1, s9
+; GFX9-NEXT: s_mul_i32 s21, s16, s10
; GFX9-NEXT: s_add_u32 s19, s19, s20
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
-; GFX9-NEXT: s_mul_i32 s21, s16, s10
-; GFX9-NEXT: s_and_b32 s20, s20, 1
; GFX9-NEXT: s_add_u32 s19, s19, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_mul_hi_u32 s22, s1, s8
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s19, s19, s22
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s19, s19, s23
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s18, s19, s18
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: s_add_i32 s20, s20, s19
; GFX9-NEXT: s_mul_i32 s19, s3, s8
; GFX9-NEXT: s_mul_i32 s21, s2, s9
+; GFX9-NEXT: s_mul_i32 s22, s1, s10
; GFX9-NEXT: s_add_u32 s19, s19, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_mul_i32 s22, s1, s10
-; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_u32 s19, s19, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
-; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_i32 s23, s16, s11
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s23
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
-; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s24, s2, s8
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s24
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
-; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s25, s1, s9
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s25
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
-; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s26, s16, s10
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s26
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
-; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s20
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
-; GFX9-NEXT: s_and_b32 s20, s20, 1
; GFX9-NEXT: s_add_i32 s21, s21, s20
; GFX9-NEXT: s_mul_i32 s20, s4, s8
; GFX9-NEXT: s_mul_i32 s22, s3, s9
+; GFX9-NEXT: s_mul_i32 s23, s2, s10
; GFX9-NEXT: s_add_u32 s20, s20, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
-; GFX9-NEXT: s_mul_i32 s23, s2, s10
-; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_u32 s20, s20, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_i32 s24, s1, s11
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s24
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_i32 s25, s16, s12
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s25
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s26, s3, s8
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s26
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s27, s2, s9
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s27
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s28, s1, s10
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s28
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s29, s16, s11
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s29
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_i32 s22, s22, s21
; GFX9-NEXT: s_mul_i32 s21, s5, s8
; GFX9-NEXT: s_mul_i32 s23, s4, s9
+; GFX9-NEXT: s_mul_i32 s24, s3, s10
; GFX9-NEXT: s_add_u32 s21, s21, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_mul_i32 s24, s3, s10
-; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_u32 s21, s21, s24
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s25, s2, s11
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s25
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s26, s1, s12
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s26
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s27, s16, s13
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s27
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s28, s4, s8
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s28
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s29
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s30, s2, s10
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s30
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s31, s1, s11
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s31
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s33, s16, s12
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s33
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
-; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_i32 s23, s23, s22
; GFX9-NEXT: s_mul_i32 s22, s6, s8
; GFX9-NEXT: s_mul_i32 s24, s5, s9
+; GFX9-NEXT: s_mul_i32 s25, s4, s10
; GFX9-NEXT: s_add_u32 s22, s22, s24
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_mul_i32 s25, s4, s10
-; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_add_u32 s22, s22, s25
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s26, s3, s11
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s26
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s27, s2, s12
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s27
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s28, s1, s13
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s28
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s29, s16, s14
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s29
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s30, s5, s8
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s30
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s31, s4, s9
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s31
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s33, s3, s10
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s33
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s34, s2, s11
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s34
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s35, s1, s12
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s35
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s36, s16, s13
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s36
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
-; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_i32 s24, s24, s23
; GFX9-NEXT: s_mul_i32 s7, s7, s8
; GFX9-NEXT: s_mul_i32 s23, s6, s9
@@ -1873,268 +1758,221 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX10-NEXT: s_mul_hi_u32 s18, s0, s8
; GFX10-NEXT: s_add_u32 s16, s16, s17
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
-; GFX10-NEXT: s_mul_i32 s19, s1, s9
-; GFX10-NEXT: s_and_b32 s17, s17, 1
; GFX10-NEXT: s_add_u32 s16, s16, s18
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
-; GFX10-NEXT: s_mul_i32 s20, s0, s10
-; GFX10-NEXT: s_and_b32 s18, s18, 1
-; GFX10-NEXT: s_mul_hi_u32 s21, s1, s8
+; GFX10-NEXT: s_mul_i32 s19, s1, s9
; GFX10-NEXT: s_add_i32 s17, s17, s18
; GFX10-NEXT: s_mul_i32 s18, s2, s8
-; GFX10-NEXT: s_mul_i32 s22, s0, s11
+; GFX10-NEXT: s_mul_i32 s20, s0, s10
; GFX10-NEXT: s_add_u32 s18, s18, s19
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
-; GFX10-NEXT: s_mul_i32 s23, s1, s11
-; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_add_u32 s18, s18, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: s_mul_i32 s24, s0, s12
-; GFX10-NEXT: s_and_b32 s20, s20, 1
-; GFX10-NEXT: s_mul_i32 s25, s4, s9
+; GFX10-NEXT: s_mul_hi_u32 s21, s1, s8
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s21, s0, s9
-; GFX10-NEXT: s_and_b32 s20, s20, 1
-; GFX10-NEXT: s_mul_i32 s26, s2, s11
+; GFX10-NEXT: s_mul_hi_u32 s22, s0, s9
; GFX10-NEXT: s_add_i32 s19, s19, s20
-; GFX10-NEXT: s_add_u32 s18, s18, s21
+; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s21, s1, s10
-; GFX10-NEXT: s_and_b32 s20, s20, 1
-; GFX10-NEXT: s_mul_i32 s27, s0, s13
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s17, s18, s17
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: s_mul_i32 s20, s2, s9
-; GFX10-NEXT: s_and_b32 s18, s18, 1
-; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9
; GFX10-NEXT: s_add_i32 s19, s19, s18
; GFX10-NEXT: s_mul_i32 s18, s3, s8
-; GFX10-NEXT: s_mul_i32 s7, s7, s8
+; GFX10-NEXT: s_mul_i32 s22, s0, s11
; GFX10-NEXT: s_add_u32 s18, s18, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: s_mul_i32 s15, s0, s15
-; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
-; GFX10-NEXT: s_and_b32 s21, s21, 1
+; GFX10-NEXT: s_mul_hi_u32 s23, s2, s8
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s22, s2, s8
-; GFX10-NEXT: s_and_b32 s21, s21, 1
+; GFX10-NEXT: s_mul_hi_u32 s24, s1, s9
; GFX10-NEXT: s_add_i32 s20, s20, s21
-; GFX10-NEXT: s_add_u32 s18, s18, s22
+; GFX10-NEXT: s_add_u32 s18, s18, s23
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s22, s1, s9
-; GFX10-NEXT: s_and_b32 s21, s21, 1
+; GFX10-NEXT: s_mul_hi_u32 s25, s0, s10
; GFX10-NEXT: s_add_i32 s20, s20, s21
-; GFX10-NEXT: s_add_u32 s18, s18, s22
+; GFX10-NEXT: s_add_u32 s18, s18, s24
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s22, s0, s10
-; GFX10-NEXT: s_and_b32 s21, s21, 1
+; GFX10-NEXT: s_mul_i32 s22, s2, s10
; GFX10-NEXT: s_add_i32 s20, s20, s21
-; GFX10-NEXT: s_add_u32 s18, s18, s22
+; GFX10-NEXT: s_add_u32 s18, s18, s25
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
-; GFX10-NEXT: s_mul_i32 s22, s2, s10
-; GFX10-NEXT: s_and_b32 s21, s21, 1
+; GFX10-NEXT: s_mul_i32 s23, s1, s11
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s19
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: s_mul_i32 s21, s3, s9
-; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_add_i32 s20, s20, s19
; GFX10-NEXT: s_mul_i32 s19, s4, s8
+; GFX10-NEXT: s_mul_i32 s24, s0, s12
; GFX10-NEXT: s_add_u32 s19, s19, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
-; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_u32 s19, s19, s22
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
-; GFX10-NEXT: s_and_b32 s22, s22, 1
+; GFX10-NEXT: s_mul_hi_u32 s25, s3, s8
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s23, s3, s8
-; GFX10-NEXT: s_and_b32 s22, s22, 1
+; GFX10-NEXT: s_mul_hi_u32 s26, s2, s9
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s24, s2, s9
-; GFX10-NEXT: s_and_b32 s22, s22, 1
+; GFX10-NEXT: s_mul_hi_u32 s27, s1, s10
; GFX10-NEXT: s_add_i32 s21, s21, s22
-; GFX10-NEXT: s_add_u32 s19, s19, s23
+; GFX10-NEXT: s_add_u32 s19, s19, s25
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s23, s1, s10
-; GFX10-NEXT: s_and_b32 s22, s22, 1
+; GFX10-NEXT: s_mul_hi_u32 s28, s0, s11
; GFX10-NEXT: s_add_i32 s21, s21, s22
-; GFX10-NEXT: s_add_u32 s19, s19, s24
+; GFX10-NEXT: s_add_u32 s19, s19, s26
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s24, s0, s11
-; GFX10-NEXT: s_and_b32 s22, s22, 1
+; GFX10-NEXT: s_mul_i32 s23, s3, s10
; GFX10-NEXT: s_add_i32 s21, s21, s22
-; GFX10-NEXT: s_add_u32 s19, s19, s23
+; GFX10-NEXT: s_add_u32 s19, s19, s27
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
-; GFX10-NEXT: s_mul_i32 s23, s5, s8
-; GFX10-NEXT: s_and_b32 s22, s22, 1
+; GFX10-NEXT: s_mul_i32 s24, s2, s11
; GFX10-NEXT: s_add_i32 s21, s21, s22
-; GFX10-NEXT: s_add_u32 s19, s19, s24
+; GFX10-NEXT: s_add_u32 s19, s19, s28
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
-; GFX10-NEXT: s_mul_i32 s24, s3, s10
-; GFX10-NEXT: s_and_b32 s22, s22, 1
+; GFX10-NEXT: s_mul_i32 s25, s1, s12
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: s_mul_i32 s22, s1, s12
-; GFX10-NEXT: s_and_b32 s20, s20, 1
+; GFX10-NEXT: s_mul_i32 s22, s4, s9
; GFX10-NEXT: s_add_i32 s21, s21, s20
-; GFX10-NEXT: s_add_u32 s23, s23, s25
-; GFX10-NEXT: s_cselect_b32 s25, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
-; GFX10-NEXT: s_and_b32 s25, s25, 1
-; GFX10-NEXT: s_add_u32 s23, s23, s24
-; GFX10-NEXT: s_cselect_b32 s24, 1, 0
-; GFX10-NEXT: s_and_b32 s24, s24, 1
-; GFX10-NEXT: s_add_i32 s24, s25, s24
-; GFX10-NEXT: s_add_u32 s23, s23, s26
-; GFX10-NEXT: s_cselect_b32 s25, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s26, s2, s10
-; GFX10-NEXT: s_and_b32 s25, s25, 1
-; GFX10-NEXT: s_add_i32 s24, s24, s25
-; GFX10-NEXT: s_add_u32 s22, s23, s22
-; GFX10-NEXT: s_cselect_b32 s23, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s25, s1, s11
-; GFX10-NEXT: s_and_b32 s23, s23, 1
-; GFX10-NEXT: s_add_i32 s23, s24, s23
-; GFX10-NEXT: s_add_u32 s22, s22, s27
-; GFX10-NEXT: s_cselect_b32 s24, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s27, s0, s12
-; GFX10-NEXT: s_and_b32 s24, s24, 1
-; GFX10-NEXT: s_add_i32 s23, s23, s24
-; GFX10-NEXT: s_add_u32 s20, s22, s20
+; GFX10-NEXT: s_mul_i32 s20, s5, s8
+; GFX10-NEXT: s_mul_i32 s26, s0, s13
+; GFX10-NEXT: s_add_u32 s20, s20, s22
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
-; GFX10-NEXT: s_mul_i32 s24, s6, s8
-; GFX10-NEXT: s_and_b32 s22, s22, 1
-; GFX10-NEXT: s_add_i32 s22, s23, s22
-; GFX10-NEXT: s_add_u32 s20, s20, s28
+; GFX10-NEXT: s_add_u32 s20, s20, s23
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
-; GFX10-NEXT: s_mul_i32 s28, s5, s9
-; GFX10-NEXT: s_and_b32 s23, s23, 1
+; GFX10-NEXT: s_mul_hi_u32 s27, s4, s8
; GFX10-NEXT: s_add_i32 s22, s22, s23
-; GFX10-NEXT: s_add_u32 s20, s20, s26
+; GFX10-NEXT: s_add_u32 s20, s20, s24
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
-; GFX10-NEXT: s_mul_i32 s26, s4, s10
-; GFX10-NEXT: s_and_b32 s23, s23, 1
+; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s25
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
-; GFX10-NEXT: s_mul_i32 s25, s3, s11
-; GFX10-NEXT: s_and_b32 s23, s23, 1
+; GFX10-NEXT: s_mul_hi_u32 s29, s2, s10
+; GFX10-NEXT: s_add_i32 s22, s22, s23
+; GFX10-NEXT: s_add_u32 s20, s20, s26
+; GFX10-NEXT: s_cselect_b32 s23, 1, 0
+; GFX10-NEXT: s_mul_hi_u32 s30, s1, s11
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s27
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
-; GFX10-NEXT: s_mul_i32 s27, s2, s12
-; GFX10-NEXT: s_and_b32 s23, s23, 1
+; GFX10-NEXT: s_mul_hi_u32 s31, s0, s12
+; GFX10-NEXT: s_add_i32 s22, s22, s23
+; GFX10-NEXT: s_add_u32 s20, s20, s28
+; GFX10-NEXT: s_cselect_b32 s23, 1, 0
+; GFX10-NEXT: s_mul_i32 s24, s4, s10
+; GFX10-NEXT: s_add_i32 s22, s22, s23
+; GFX10-NEXT: s_add_u32 s20, s20, s29
+; GFX10-NEXT: s_cselect_b32 s23, 1, 0
+; GFX10-NEXT: s_mul_i32 s25, s3, s11
+; GFX10-NEXT: s_add_i32 s22, s22, s23
+; GFX10-NEXT: s_add_u32 s20, s20, s30
+; GFX10-NEXT: s_cselect_b32 s23, 1, 0
+; GFX10-NEXT: s_mul_i32 s26, s2, s12
+; GFX10-NEXT: s_add_i32 s22, s22, s23
+; GFX10-NEXT: s_add_u32 s20, s20, s31
+; GFX10-NEXT: s_cselect_b32 s23, 1, 0
+; GFX10-NEXT: s_mul_i32 s27, s1, s13
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
-; GFX10-NEXT: s_mul_i32 s23, s1, s13
-; GFX10-NEXT: s_and_b32 s21, s21, 1
+; GFX10-NEXT: s_mul_i32 s23, s5, s9
; GFX10-NEXT: s_add_i32 s22, s22, s21
-; GFX10-NEXT: s_add_u32 s21, s24, s28
-; GFX10-NEXT: s_cselect_b32 s24, 1, 0
+; GFX10-NEXT: s_mul_i32 s21, s6, s8
; GFX10-NEXT: s_mul_i32 s28, s0, s14
-; GFX10-NEXT: s_and_b32 s24, s24, 1
-; GFX10-NEXT: s_add_u32 s21, s21, s26
-; GFX10-NEXT: s_cselect_b32 s26, 1, 0
-; GFX10-NEXT: s_and_b32 s26, s26, 1
-; GFX10-NEXT: s_add_i32 s24, s24, s26
-; GFX10-NEXT: s_add_u32 s21, s21, s25
-; GFX10-NEXT: s_cselect_b32 s25, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s26, s5, s8
-; GFX10-NEXT: s_and_b32 s25, s25, 1
-; GFX10-NEXT: s_add_i32 s24, s24, s25
-; GFX10-NEXT: s_add_u32 s21, s21, s27
-; GFX10-NEXT: s_cselect_b32 s25, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s27, s4, s9
-; GFX10-NEXT: s_and_b32 s25, s25, 1
-; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s21, s21, s23
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s25, s3, s10
-; GFX10-NEXT: s_and_b32 s23, s23, 1
-; GFX10-NEXT: s_add_i32 s23, s24, s23
-; GFX10-NEXT: s_add_u32 s21, s21, s28
+; GFX10-NEXT: s_add_u32 s21, s21, s24
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s28, s2, s11
-; GFX10-NEXT: s_and_b32 s24, s24, 1
+; GFX10-NEXT: s_mul_hi_u32 s29, s5, s8
+; GFX10-NEXT: s_add_i32 s23, s23, s24
+; GFX10-NEXT: s_add_u32 s21, s21, s25
+; GFX10-NEXT: s_cselect_b32 s24, 1, 0
+; GFX10-NEXT: s_mul_hi_u32 s30, s4, s9
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s26, s1, s12
-; GFX10-NEXT: s_and_b32 s24, s24, 1
+; GFX10-NEXT: s_mul_hi_u32 s31, s3, s10
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
-; GFX10-NEXT: s_mul_hi_u32 s27, s0, s13
-; GFX10-NEXT: s_and_b32 s24, s24, 1
+; GFX10-NEXT: s_mul_hi_u32 s33, s2, s11
; GFX10-NEXT: s_add_i32 s23, s23, s24
-; GFX10-NEXT: s_add_u32 s21, s21, s25
+; GFX10-NEXT: s_add_u32 s21, s21, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
-; GFX10-NEXT: s_mul_i32 s25, s6, s9
-; GFX10-NEXT: s_and_b32 s24, s24, 1
-; GFX10-NEXT: s_mul_hi_u32 s6, s6, s8
+; GFX10-NEXT: s_mul_hi_u32 s34, s1, s12
; GFX10-NEXT: s_add_i32 s23, s23, s24
-; GFX10-NEXT: s_add_u32 s21, s21, s28
+; GFX10-NEXT: s_add_u32 s21, s21, s29
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
-; GFX10-NEXT: s_and_b32 s24, s24, 1
+; GFX10-NEXT: s_mul_hi_u32 s35, s0, s13
; GFX10-NEXT: s_add_i32 s23, s23, s24
-; GFX10-NEXT: s_add_u32 s21, s21, s26
+; GFX10-NEXT: s_add_u32 s21, s21, s30
+; GFX10-NEXT: s_cselect_b32 s24, 1, 0
+; GFX10-NEXT: s_mul_i32 s7, s7, s8
+; GFX10-NEXT: s_add_i32 s23, s23, s24
+; GFX10-NEXT: s_add_u32 s21, s21, s31
+; GFX10-NEXT: s_cselect_b32 s24, 1, 0
+; GFX10-NEXT: s_mul_i32 s25, s5, s10
+; GFX10-NEXT: s_add_i32 s23, s23, s24
+; GFX10-NEXT: s_add_u32 s21, s21, s33
+; GFX10-NEXT: s_cselect_b32 s24, 1, 0
+; GFX10-NEXT: s_mul_i32 s15, s0, s15
+; GFX10-NEXT: s_add_i32 s23, s23, s24
+; GFX10-NEXT: s_add_u32 s21, s21, s34
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
-; GFX10-NEXT: s_mul_i32 s26, s5, s10
-; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s5, s5, s9
; GFX10-NEXT: s_add_i32 s23, s23, s24
-; GFX10-NEXT: s_add_u32 s21, s21, s27
+; GFX10-NEXT: s_add_u32 s21, s21, s35
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
-; GFX10-NEXT: s_mul_i32 s27, s4, s11
-; GFX10-NEXT: s_and_b32 s24, s24, 1
-; GFX10-NEXT: s_mul_hi_u32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s23, s23, s24
+; GFX10-NEXT: s_mul_i32 s24, s6, s9
; GFX10-NEXT: s_add_u32 s21, s21, s22
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
+; GFX10-NEXT: s_add_i32 s7, s7, s24
+; GFX10-NEXT: s_mul_i32 s24, s4, s11
; GFX10-NEXT: s_add_i32 s7, s7, s25
-; GFX10-NEXT: s_mul_i32 s24, s3, s12
-; GFX10-NEXT: s_add_i32 s7, s7, s26
-; GFX10-NEXT: s_mul_i32 s25, s2, s13
-; GFX10-NEXT: s_add_i32 s7, s7, s27
-; GFX10-NEXT: s_mul_i32 s26, s1, s14
+; GFX10-NEXT: s_mul_i32 s25, s3, s12
; GFX10-NEXT: s_add_i32 s7, s7, s24
-; GFX10-NEXT: s_mul_hi_u32 s3, s3, s11
+; GFX10-NEXT: s_mul_i32 s24, s2, s13
; GFX10-NEXT: s_add_i32 s7, s7, s25
-; GFX10-NEXT: s_mul_hi_u32 s2, s2, s12
-; GFX10-NEXT: s_add_i32 s7, s7, s26
-; GFX10-NEXT: s_mul_hi_u32 s1, s1, s13
+; GFX10-NEXT: s_mul_i32 s25, s1, s14
+; GFX10-NEXT: s_add_i32 s7, s7, s24
+; GFX10-NEXT: s_mul_hi_u32 s6, s6, s8
+; GFX10-NEXT: s_add_i32 s7, s7, s25
+; GFX10-NEXT: s_mul_hi_u32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s7, s7, s15
+; GFX10-NEXT: s_mul_hi_u32 s3, s3, s11
; GFX10-NEXT: s_add_i32 s6, s7, s6
+; GFX10-NEXT: s_mul_hi_u32 s2, s2, s12
; GFX10-NEXT: s_add_i32 s5, s6, s5
-; GFX10-NEXT: s_mov_b32 s6, s21
+; GFX10-NEXT: s_mul_hi_u32 s1, s1, s13
; GFX10-NEXT: s_add_i32 s4, s5, s4
-; GFX10-NEXT: s_mov_b32 s5, s20
+; GFX10-NEXT: s_add_i32 s23, s23, s22
; GFX10-NEXT: s_add_i32 s3, s4, s3
-; GFX10-NEXT: s_mul_hi_u32 s4, s0, s14
+; GFX10-NEXT: s_mov_b32 s4, s19
; GFX10-NEXT: s_add_i32 s2, s3, s2
-; GFX10-NEXT: s_and_b32 s3, s22, 1
+; GFX10-NEXT: s_mul_hi_u32 s3, s0, s14
; GFX10-NEXT: s_add_i32 s1, s2, s1
-; GFX10-NEXT: s_add_i32 s23, s23, s3
-; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: s_mul_i32 s0, s0, s8
+; GFX10-NEXT: s_add_i32 s1, s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s17
; GFX10-NEXT: s_add_i32 s7, s1, s23
; GFX10-NEXT: s_mov_b32 s1, s16
-; GFX10-NEXT: s_mov_b32 s2, s17
; GFX10-NEXT: s_mov_b32 s3, s18
-; GFX10-NEXT: s_mov_b32 s4, s19
+; GFX10-NEXT: s_mov_b32 s5, s20
+; GFX10-NEXT: s_mov_b32 s6, s21
; GFX10-NEXT: ; return to shader part epilog
%result = mul i256 %num, %den
%cast = bitcast i256 %result to <8 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 642b20879ea04..db72cf406c9cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4217,9 +4217,6 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-LABEL: s_saddsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s4, s0, s2
-; GFX6-NEXT: s_cselect_b32 s5, 1, 0
-; GFX6-NEXT: s_and_b32 s5, s5, 1
-; GFX6-NEXT: s_cmp_lg_u32 s5, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_addc_u32 s5, s1, s3
; GFX6-NEXT: v_mov_b32_e32 v1, s1
@@ -4243,9 +4240,6 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-LABEL: s_saddsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s4, s0, s2
-; GFX8-NEXT: s_cselect_b32 s5, 1, 0
-; GFX8-NEXT: s_and_b32 s5, s5, 1
-; GFX8-NEXT: s_cmp_lg_u32 s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_addc_u32 s5, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -4269,9 +4263,6 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-LABEL: s_saddsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s4, s0, s2
-; GFX9-NEXT: s_cselect_b32 s5, 1, 0
-; GFX9-NEXT: s_and_b32 s5, s5, 1
-; GFX9-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s5, s1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -4295,15 +4286,12 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX10-LABEL: s_saddsat_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s4, s0, s2
-; GFX10-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
-; GFX10-NEXT: s_and_b32 s5, s5, 1
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_addc_u32 s5, s1, s3
-; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1]
+; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_xor_b32 s2, s2, s1
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
@@ -4559,9 +4547,6 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-LABEL: s_saddsat_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s8, s0, s4
-; GFX6-NEXT: s_cselect_b32 s9, 1, 0
-; GFX6-NEXT: s_and_b32 s9, s9, 1
-; GFX6-NEXT: s_cmp_lg_u32 s9, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_addc_u32 s9, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s1
@@ -4572,16 +4557,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_addc_u32 s1, s4, s5
-; GFX6-NEXT: s_add_u32 s0, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: s_cselect_b32 s1, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: s_and_b32 s1, s1, 1
+; GFX6-NEXT: s_addc_u32 s1, s4, s5
; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX6-NEXT: s_cmp_lg_u32 s1, 0
+; GFX6-NEXT: s_add_u32 s0, s2, s6
; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: v_mov_b32_e32 v3, s9
; GFX6-NEXT: s_addc_u32 s1, s3, s7
; GFX6-NEXT: v_mov_b32_e32 v1, s3
@@ -4608,9 +4590,6 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-LABEL: s_saddsat_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s8, s0, s4
-; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: s_and_b32 s9, s9, 1
-; GFX8-NEXT: s_cmp_lg_u32 s9, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_addc_u32 s9, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -4621,16 +4600,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX8-NEXT: s_brev_b32 s5, 1
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
-; GFX8-NEXT: s_addc_u32 s1, s4, s5
-; GFX8-NEXT: s_add_u32 s0, s2, s6
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s8
-; GFX8-NEXT: s_and_b32 s1, s1, 1
+; GFX8-NEXT: s_addc_u32 s1, s4, s5
; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
+; GFX8-NEXT: s_add_u32 s0, s2, s6
; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: s_addc_u32 s1, s3, s7
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -4657,9 +4633,6 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-LABEL: s_saddsat_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s8, s0, s4
-; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: s_and_b32 s9, s9, 1
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_addc_u32 s9, s1, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -4670,16 +4643,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: s_brev_b32 s5, 1
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
-; GFX9-NEXT: s_addc_u32 s1, s4, s5
-; GFX9-NEXT: s_add_u32 s0, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-NEXT: s_and_b32 s1, s1, 1
+; GFX9-NEXT: s_addc_u32 s1, s4, s5
; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
+; GFX9-NEXT: s_add_u32 s0, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: s_addc_u32 s1, s3, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s3
@@ -4706,32 +4676,26 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-LABEL: s_saddsat_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s8, s0, s4
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
+; GFX10-NEXT: s_addc_u32 s9, s1, s5
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0
-; GFX10-NEXT: s_and_b32 s9, s9, 1
+; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
; GFX10-NEXT: s_mov_b32 s11, 0
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
+; GFX10-NEXT: s_ashr_i32 s0, s9, 31
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: s_addc_u32 s9, s1, s5
; GFX10-NEXT: s_brev_b32 s10, 1
-; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
-; GFX10-NEXT: s_ashr_i32 s0, s9, 31
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_xor_b32 s8, s4, s1
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8
; GFX10-NEXT: s_addc_u32 s1, s0, s10
; GFX10-NEXT: s_add_u32 s4, s2, s6
-; GFX10-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: s_and_b32 s5, s5, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8
-; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_addc_u32 s5, s3, s7
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0
; GFX10-NEXT: s_ashr_i32 s0, s5, 31
; GFX10-NEXT: v_mov_b32_e32 v3, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8
; GFX10-NEXT: s_xor_b32 s2, s3, s2
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2
@@ -4750,19 +4714,10 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-LABEL: s_saddsat_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s4, s0, s4
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: s_and_b32 s8, s8, 1
-; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_addc_u32 s5, s1, s5
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: s_and_b32 s8, s8, 1
-; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_addc_u32 s8, s2, s6
-; GFX6-NEXT: s_cselect_b32 s9, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: s_and_b32 s9, s9, 1
+; GFX6-NEXT: s_addc_u32 s5, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_cmp_lg_u32 s9, 0
+; GFX6-NEXT: s_addc_u32 s8, s2, s6
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
; GFX6-NEXT: s_addc_u32 s9, s3, s7
@@ -4779,15 +4734,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: s_ashr_i32 s0, s9, 31
; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
+; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_addc_u32 s1, s0, 0
-; GFX6-NEXT: s_cselect_b32 s2, 1, 0
-; GFX6-NEXT: s_and_b32 s2, s2, 1
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s2, s0, 0
-; GFX6-NEXT: s_cselect_b32 s3, 1, 0
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT: s_and_b32 s3, s3, 1
-; GFX6-NEXT: s_cmp_lg_u32 s3, 0
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
@@ -4812,18 +4761,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-LABEL: s_saddsat_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s4, s0, s4
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: s_and_b32 s8, s8, 1
-; GFX8-NEXT: s_cmp_lg_u32 s8, 0
; GFX8-NEXT: s_addc_u32 s5, s1, s5
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: s_and_b32 s8, s8, 1
-; GFX8-NEXT: s_cmp_lg_u32 s8, 0
-; GFX8-NEXT: s_addc_u32 s8, s2, s6
-; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: s_and_b32 s9, s9, 1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_cmp_lg_u32 s9, 0
+; GFX8-NEXT: s_addc_u32 s8, s2, s6
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_addc_u32 s9, s3, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s2
@@ -4845,17 +4785,11 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
; GFX8-NEXT: s_ashr_i32 s0, s9, 31
; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
+; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_addc_u32 s1, s0, 0
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
-; GFX8-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-NEXT: s_addc_u32 s2, s0, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_and_b32 s3, s3, 1
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
@@ -4880,18 +4814,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-LABEL: s_saddsat_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s4, s0, s4
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: s_and_b32 s8, s8, 1
-; GFX9-NEXT: s_cmp_lg_u32 s8, 0
; GFX9-NEXT: s_addc_u32 s5, s1, s5
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: s_and_b32 s8, s8, 1
-; GFX9-NEXT: s_cmp_lg_u32 s8, 0
-; GFX9-NEXT: s_addc_u32 s8, s2, s6
-; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: s_and_b32 s9, s9, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: s_addc_u32 s8, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_addc_u32 s9, s3, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -4913,17 +4838,11 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
; GFX9-NEXT: s_ashr_i32 s0, s9, 31
; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
+; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_addc_u32 s1, s0, 0
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_addc_u32 s2, s0, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: s_and_b32 s3, s3, 1
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
@@ -4948,60 +4867,45 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-LABEL: s_saddsat_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s4, s0, s4
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s10, s[6:7], 0
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s5, s1, s5
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s8, s2, s6
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s9, s9, 1
-; GFX10-NEXT: v_mov_b32_e32 v3, s8
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
; GFX10-NEXT: s_addc_u32 s9, s3, s7
; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[8:9], s[2:3]
+; GFX10-NEXT: v_mov_b32_e32 v3, s9
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s9
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], 0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: s_and_b32 s0, 1, s10
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: s_and_b32 s1, 1, s1
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
; GFX10-NEXT: s_mov_b32 s1, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
; GFX10-NEXT: s_ashr_i32 s0, s9, 31
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, s5
; GFX10-NEXT: s_addc_u32 s1, s0, 0
-; GFX10-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
-; GFX10-NEXT: s_and_b32 s2, s2, 1
+; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_addc_u32 s2, s0, 0
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
-; GFX10-NEXT: s_and_b32 s3, s3, 1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s1, v2
+; GFX10-NEXT: v_readfirstlane_b32 s2, v0
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
%result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
@@ -5527,19 +5431,10 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-LABEL: s_saddsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s8, s0, s8
-; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_and_b32 s16, s16, 1
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_addc_u32 s9, s1, s9
-; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_and_b32 s16, s16, 1
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_addc_u32 s16, s2, s10
-; GFX6-NEXT: s_cselect_b32 s17, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: s_and_b32 s17, s17, 1
+; GFX6-NEXT: s_addc_u32 s9, s1, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
+; GFX6-NEXT: s_addc_u32 s16, s2, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
; GFX6-NEXT: s_addc_u32 s17, s3, s11
@@ -5551,50 +5446,35 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
-; GFX6-NEXT: s_brev_b32 s10, 1
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
; GFX6-NEXT: s_ashr_i32 s0, s17, 31
; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
+; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_addc_u32 s1, s0, 0
-; GFX6-NEXT: s_cselect_b32 s2, 1, 0
-; GFX6-NEXT: s_and_b32 s2, s2, 1
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_brev_b32 s10, 1
; GFX6-NEXT: s_addc_u32 s2, s0, 0
-; GFX6-NEXT: s_cselect_b32 s3, 1, 0
-; GFX6-NEXT: s_and_b32 s3, s3, 1
-; GFX6-NEXT: s_cmp_lg_u32 s3, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: s_addc_u32 s3, s0, s10
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: s_add_u32 s0, s4, s12
; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: s_cselect_b32 s1, 1, 0
-; GFX6-NEXT: s_and_b32 s1, s1, 1
-; GFX6-NEXT: s_cmp_lg_u32 s1, 0
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_addc_u32 s1, s5, s13
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: s_cselect_b32 s2, 1, 0
-; GFX6-NEXT: s_and_b32 s2, s2, 1
; GFX6-NEXT: v_mov_b32_e32 v3, s8
; GFX6-NEXT: v_mov_b32_e32 v4, s9
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: v_mov_b32_e32 v2, s16
; GFX6-NEXT: v_mov_b32_e32 v3, s17
-; GFX6-NEXT: s_addc_u32 s2, s6, s14
; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX6-NEXT: s_cselect_b32 s3, 1, 0
+; GFX6-NEXT: s_add_u32 s0, s4, s12
; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: s_and_b32 s3, s3, 1
+; GFX6-NEXT: s_addc_u32 s1, s5, s13
; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: s_cmp_lg_u32 s3, 0
+; GFX6-NEXT: s_addc_u32 s2, s6, s14
; GFX6-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX6-NEXT: s_addc_u32 s3, s7, s15
@@ -5611,15 +5491,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: s_cmp_lg_u32 s5, 0
+; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_addc_u32 s5, s4, 0
-; GFX6-NEXT: s_cselect_b32 s6, 1, 0
-; GFX6-NEXT: s_and_b32 s6, s6, 1
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s6, s4, 0
-; GFX6-NEXT: s_cselect_b32 s7, 1, 0
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT: s_and_b32 s7, s7, 1
-; GFX6-NEXT: s_cmp_lg_u32 s7, 0
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: s_addc_u32 s7, s4, s10
; GFX6-NEXT: v_mov_b32_e32 v1, s4
@@ -5648,18 +5522,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-LABEL: s_saddsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s8, s0, s8
-; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_and_b32 s16, s16, 1
-; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_addc_u32 s9, s1, s9
-; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_and_b32 s16, s16, 1
-; GFX8-NEXT: s_cmp_lg_u32 s16, 0
-; GFX8-NEXT: s_addc_u32 s16, s2, s10
-; GFX8-NEXT: s_cselect_b32 s17, 1, 0
-; GFX8-NEXT: s_and_b32 s17, s17, 1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_cmp_lg_u32 s17, 0
+; GFX8-NEXT: s_addc_u32 s16, s2, s10
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_addc_u32 s17, s3, s11
; GFX8-NEXT: v_mov_b32_e32 v0, s2
@@ -5681,46 +5546,31 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
; GFX8-NEXT: s_ashr_i32 s0, s17, 31
; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
+; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_addc_u32 s1, s0, 0
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
-; GFX8-NEXT: s_cmp_lg_u32 s2, 0
-; GFX8-NEXT: s_addc_u32 s2, s0, 0
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
-; GFX8-NEXT: s_and_b32 s3, s3, 1
; GFX8-NEXT: s_brev_b32 s10, 1
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: s_addc_u32 s2, s0, 0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_addc_u32 s3, s0, s10
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: s_add_u32 s0, s4, s12
; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_addc_u32 s1, s5, s13
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
-; GFX8-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s8
; GFX8-NEXT: v_mov_b32_e32 v4, s9
-; GFX8-NEXT: s_addc_u32 s2, s6, s14
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NEXT: v_mov_b32_e32 v3, s17
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
+; GFX8-NEXT: s_add_u32 s0, s4, s12
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX8-NEXT: s_and_b32 s3, s3, 1
+; GFX8-NEXT: s_addc_u32 s1, s5, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
+; GFX8-NEXT: s_addc_u32 s2, s6, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_addc_u32 s3, s7, s15
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -5742,17 +5592,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
; GFX8-NEXT: s_ashr_i32 s4, s3, 31
; GFX8-NEXT: s_mov_b32 s5, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
+; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_addc_u32 s5, s4, 0
-; GFX8-NEXT: s_cselect_b32 s6, 1, 0
-; GFX8-NEXT: s_and_b32 s6, s6, 1
-; GFX8-NEXT: s_cmp_lg_u32 s6, 0
; GFX8-NEXT: s_addc_u32 s6, s4, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: s_cselect_b32 s7, 1, 0
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_and_b32 s7, s7, 1
-; GFX8-NEXT: s_cmp_lg_u32 s7, 0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_addc_u32 s7, s4, s10
; GFX8-NEXT: v_mov_b32_e32 v1, s4
@@ -5781,18 +5625,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-LABEL: s_saddsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s8, s0, s8
-; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_and_b32 s16, s16, 1
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_addc_u32 s9, s1, s9
-; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_and_b32 s16, s16, 1
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_addc_u32 s16, s2, s10
-; GFX9-NEXT: s_cselect_b32 s17, 1, 0
-; GFX9-NEXT: s_and_b32 s17, s17, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_addc_u32 s16, s2, s10
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_addc_u32 s17, s3, s11
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -5814,46 +5649,31 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
; GFX9-NEXT: s_ashr_i32 s0, s17, 31
; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
+; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_addc_u32 s1, s0, 0
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_addc_u32 s2, s0, 0
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: s_and_b32 s3, s3, 1
; GFX9-NEXT: s_brev_b32 s10, 1
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: s_addc_u32 s2, s0, 0
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_addc_u32 s3, s0, s10
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: s_add_u32 s0, s4, s12
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_addc_u32 s1, s5, s13
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: v_mov_b32_e32 v4, s9
-; GFX9-NEXT: s_addc_u32 s2, s6, s14
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s16
; GFX9-NEXT: v_mov_b32_e32 v3, s17
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-NEXT: s_add_u32 s0, s4, s12
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX9-NEXT: s_and_b32 s3, s3, 1
+; GFX9-NEXT: s_addc_u32 s1, s5, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
+; GFX9-NEXT: s_addc_u32 s2, s6, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: s_addc_u32 s3, s7, s15
; GFX9-NEXT: v_mov_b32_e32 v0, s6
@@ -5875,17 +5695,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_mov_b32 s5, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
+; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_addc_u32 s5, s4, 0
-; GFX9-NEXT: s_cselect_b32 s6, 1, 0
-; GFX9-NEXT: s_and_b32 s6, s6, 1
-; GFX9-NEXT: s_cmp_lg_u32 s6, 0
; GFX9-NEXT: s_addc_u32 s6, s4, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: s_cselect_b32 s7, 1, 0
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: s_and_b32 s7, s7, 1
-; GFX9-NEXT: s_cmp_lg_u32 s7, 0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_addc_u32 s7, s4, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -5914,25 +5728,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-LABEL: s_saddsat_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s8, s0, s8
-; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: s_and_b32 s16, s16, 1
-; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_addc_u32 s9, s1, s9
-; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT: s_and_b32 s16, s16, 1
-; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_addc_u32 s16, s2, s10
-; GFX10-NEXT: s_cselect_b32 s17, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s17, s17, 1
-; GFX10-NEXT: s_cmp_lg_u32 s17, 0
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
; GFX10-NEXT: s_addc_u32 s17, s3, s11
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX10-NEXT: v_mov_b32_e32 v5, s17
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, s17
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: s_and_b32 s0, 1, s18
; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
@@ -5940,91 +5745,70 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: s_and_b32 s1, 1, s1
-; GFX10-NEXT: s_brev_b32 s10, 1
+; GFX10-NEXT: s_ashr_i32 s2, s17, 31
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_brev_b32 s11, 1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX10-NEXT: s_ashr_i32 s0, s17, 31
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, s9
-; GFX10-NEXT: s_addc_u32 s1, s0, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_addc_u32 s1, s2, 0
+; GFX10-NEXT: s_addc_u32 s10, s2, 0
+; GFX10-NEXT: s_addc_u32 s3, s2, s11
+; GFX10-NEXT: s_add_u32 s12, s4, s12
+; GFX10-NEXT: s_addc_u32 s13, s5, s13
+; GFX10-NEXT: s_addc_u32 s18, s6, s14
+; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[12:13], s[4:5]
+; GFX10-NEXT: s_addc_u32 s19, s7, s15
+; GFX10-NEXT: v_cmp_lt_i64_e64 s5, s[14:15], 0
+; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[6:7]
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s8
-; GFX10-NEXT: s_and_b32 s2, s2, 1
-; GFX10-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[18:19], s[6:7]
+; GFX10-NEXT: s_and_b32 s0, 1, s0
+; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: s_addc_u32 s2, s0, 0
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
-; GFX10-NEXT: s_and_b32 s3, s3, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: s_and_b32 s4, 1, s4
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
-; GFX10-NEXT: s_addc_u32 s3, s0, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT: s_add_u32 s0, s4, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
-; GFX10-NEXT: s_and_b32 s1, s1, 1
-; GFX10-NEXT: v_mov_b32_e32 v2, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, s9
+; GFX10-NEXT: v_mov_b32_e32 v6, s13
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v7, s19
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0
+; GFX10-NEXT: v_mov_b32_e32 v4, s16
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_ashr_i32 s0, s19, 31
+; GFX10-NEXT: v_xor_b32_e32 v2, v3, v2
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
-; GFX10-NEXT: s_addc_u32 s1, s5, s13
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
-; GFX10-NEXT: s_and_b32 s8, s8, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v5, s12
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: s_addc_u32 s1, s0, 0
+; GFX10-NEXT: s_addc_u32 s2, s0, 0
+; GFX10-NEXT: s_addc_u32 s3, s0, s11
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, s18
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[14:15], 0
-; GFX10-NEXT: s_addc_u32 s8, s6, s14
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
-; GFX10-NEXT: s_and_b32 s9, s9, 1
-; GFX10-NEXT: v_mov_b32_e32 v7, s8
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
-; GFX10-NEXT: s_addc_u32 s9, s7, s15
-; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
-; GFX10-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-NEXT: v_mov_b32_e32 v8, s9
-; GFX10-NEXT: s_and_b32 s2, 1, s2
-; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
-; GFX10-NEXT: s_and_b32 s3, 1, s3
-; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, s3
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX10-NEXT: s_mov_b32 s3, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, 0, s2
-; GFX10-NEXT: s_ashr_i32 s2, s9, 31
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
-; GFX10-NEXT: v_mov_b32_e32 v6, s1
-; GFX10-NEXT: s_addc_u32 s3, s2, 0
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
-; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4
-; GFX10-NEXT: s_and_b32 s4, s4, 1
-; GFX10-NEXT: v_mov_b32_e32 v5, s0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT: s_addc_u32 s4, s2, 0
-; GFX10-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-NEXT: s_and_b32 s5, s5, 1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT: s_cmp_lg_u32 s5, 0
-; GFX10-NEXT: s_addc_u32 s1, s2, s10
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: v_readfirstlane_b32 s4, v4
-; GFX10-NEXT: v_readfirstlane_b32 s5, v5
-; GFX10-NEXT: v_readfirstlane_b32 s6, v6
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s1, v0
+; GFX10-NEXT: v_readfirstlane_b32 s2, v3
+; GFX10-NEXT: v_readfirstlane_b32 s3, v4
+; GFX10-NEXT: v_readfirstlane_b32 s4, v5
+; GFX10-NEXT: v_readfirstlane_b32 s5, v6
+; GFX10-NEXT: v_readfirstlane_b32 s6, v2
; GFX10-NEXT: v_readfirstlane_b32 s7, v7
; GFX10-NEXT: ; return to shader part epilog
%result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0b1105fba0eba..d4378da215ee5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -208,14 +208,8 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
; CHECK-NEXT: s_ashr_i32 s8, s5, 31
; CHECK-NEXT: s_add_u32 s0, s2, s6
-; CHECK-NEXT: s_cselect_b32 s1, 1, 0
-; CHECK-NEXT: s_and_b32 s1, s1, 1
-; CHECK-NEXT: s_cmp_lg_u32 s1, 0
; CHECK-NEXT: s_addc_u32 s1, s3, s6
; CHECK-NEXT: s_add_u32 s10, s4, s8
-; CHECK-NEXT: s_cselect_b32 s3, 1, 0
-; CHECK-NEXT: s_and_b32 s3, s3, 1
-; CHECK-NEXT: s_cmp_lg_u32 s3, 0
; CHECK-NEXT: s_mov_b32 s9, s8
; CHECK-NEXT: s_addc_u32 s11, s5, s8
; CHECK-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9]
@@ -226,21 +220,18 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_sub_u32 s0, 0, s10
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT: s_cselect_b32 s1, 1, 0
-; CHECK-NEXT: s_and_b32 s1, s1, 1
-; CHECK-NEXT: s_cmp_lg_u32 s1, 0
+; CHECK-NEXT: s_subb_u32 s1, 0, s11
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v1, v1
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: s_subb_u32 s1, 0, s11
-; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1
-; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, s0, v1
+; CHECK-NEXT: v_mul_lo_u32 v3, s1, v0
; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0
; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CHECK-NEXT: v_mul_lo_u32 v3, v1, v4
; CHECK-NEXT: v_mul_lo_u32 v5, v0, v2
@@ -1196,43 +1187,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s10, 0x1000
-; GISEL-NEXT: s_add_u32 s4, s10, 0
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
; GISEL-NEXT: s_mov_b32 s6, 0
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9
; GISEL-NEXT: s_sub_u32 s4, 0, s8
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GISEL-NEXT: s_subb_u32 s5, 0, s9
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7
; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
@@ -1256,7 +1242,6 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
@@ -1327,15 +1312,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8
-; GISEL-NEXT: s_and_b32 s5, s5, 1
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
@@ -1347,25 +1329,22 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7
; GISEL-NEXT: s_sub_u32 s4, 0, s6
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GISEL-NEXT: s_subb_u32 s5, 0, s7
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -1912,43 +1891,38 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb
-; GISEL-NEXT: s_add_u32 s4, s10, 0
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
; GISEL-NEXT: s_mov_b32 s6, 0
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9
; GISEL-NEXT: s_sub_u32 s4, 0, s8
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GISEL-NEXT: s_subb_u32 s5, 0, s9
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7
; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
@@ -1972,7 +1946,6 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
@@ -2043,15 +2016,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8
-; GISEL-NEXT: s_and_b32 s5, s5, 1
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
@@ -2063,25 +2033,22 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7
; GISEL-NEXT: s_sub_u32 s4, 0, s6
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GISEL-NEXT: s_subb_u32 s5, 0, s7
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 25eae693c1634..5d773c3d9c5ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -150,14 +150,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX8-NEXT: s_ashr_i32 s2, s9, 31
; GFX8-NEXT: s_ashr_i32 s12, s11, 31
; GFX8-NEXT: s_add_u32 s0, s8, s2
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
; GFX8-NEXT: s_addc_u32 s1, s9, s2
; GFX8-NEXT: s_add_u32 s8, s10, s12
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
-; GFX8-NEXT: s_and_b32 s3, s3, 1
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
; GFX8-NEXT: s_mov_b32 s13, s12
; GFX8-NEXT: s_addc_u32 s9, s11, s12
; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
@@ -169,8 +163,7 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_sub_u32 s0, 0, s8
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
+; GFX8-NEXT: s_subb_u32 s1, 0, s9
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
@@ -178,8 +171,6 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
-; GFX8-NEXT: s_subb_u32 s1, 0, s9
; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0
@@ -329,14 +320,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX9-NEXT: s_ashr_i32 s2, s9, 31
; GFX9-NEXT: s_ashr_i32 s12, s11, 31
; GFX9-NEXT: s_add_u32 s0, s8, s2
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
; GFX9-NEXT: s_addc_u32 s1, s9, s2
; GFX9-NEXT: s_add_u32 s8, s10, s12
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: s_and_b32 s3, s3, 1
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-NEXT: s_mov_b32 s13, s12
; GFX9-NEXT: s_addc_u32 s9, s11, s12
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
@@ -348,8 +333,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_sub_u32 s0, 0, s8
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
+; GFX9-NEXT: s_subb_u32 s1, 0, s9
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
@@ -357,27 +342,24 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
-; GFX9-NEXT: s_subb_u32 s1, 0, s9
; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1
; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0
; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0
; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, s11
; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4
; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5
-; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2
-; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5
+; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, v1, v2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2
-; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -499,27 +481,18 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX10-NEXT: s_ashr_i32 s2, s9, 31
; GFX10-NEXT: s_ashr_i32 s12, s11, 31
; GFX10-NEXT: s_add_u32 s0, s8, s2
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_mov_b32 s13, s12
-; GFX10-NEXT: s_and_b32 s1, s1, 1
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s1, s9, s2
; GFX10-NEXT: s_add_u32 s8, s10, s12
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
-; GFX10-NEXT: s_and_b32 s3, s3, 1
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
-; GFX10-NEXT: s_mov_b32 s3, s2
+; GFX10-NEXT: s_mov_b32 s13, s12
; GFX10-NEXT: s_addc_u32 s9, s11, s12
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_mov_b32 s3, s2
; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8
; GFX10-NEXT: s_sub_u32 s10, 0, s8
-; GFX10-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10-NEXT: s_and_b32 s11, s11, 1
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
-; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: s_subb_u32 s11, 0, s9
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1335,14 +1308,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: s_ashr_i32 s2, s9, 31
; GFX8-NEXT: s_ashr_i32 s6, s13, 31
; GFX8-NEXT: s_add_u32 s0, s8, s2
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
; GFX8-NEXT: s_addc_u32 s1, s9, s2
; GFX8-NEXT: s_add_u32 s8, s12, s6
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
-; GFX8-NEXT: s_and_b32 s3, s3, 1
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
; GFX8-NEXT: s_mov_b32 s7, s6
; GFX8-NEXT: s_addc_u32 s9, s13, s6
; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7]
@@ -1354,8 +1321,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_sub_u32 s0, 0, s8
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
+; GFX8-NEXT: s_subb_u32 s1, 0, s9
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
@@ -1363,8 +1329,6 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
-; GFX8-NEXT: s_subb_u32 s1, 0, s9
; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0
@@ -1496,14 +1460,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: s_add_u32 s0, s10, s6
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX8-NEXT: v_mov_b32_e32 v4, s1
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
; GFX8-NEXT: s_addc_u32 s1, s11, s6
; GFX8-NEXT: s_add_u32 s10, s14, s8
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
-; GFX8-NEXT: s_and_b32 s3, s3, 1
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
; GFX8-NEXT: s_mov_b32 s9, s8
; GFX8-NEXT: s_addc_u32 s11, s15, s8
; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9]
@@ -1516,8 +1474,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: v_add_f32_e32 v4, v4, v5
; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GFX8-NEXT: s_sub_u32 s0, 0, s10
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
+; GFX8-NEXT: s_subb_u32 s1, 0, s11
+; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX8-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GFX8-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
; GFX8-NEXT: v_trunc_f32_e32 v6, v6
@@ -1525,17 +1483,14 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: v_add_f32_e32 v4, v7, v4
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
-; GFX8-NEXT: s_subb_u32 s1, 0, s11
+; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NEXT: v_mul_lo_u32 v4, s1, v7
; GFX8-NEXT: v_mul_lo_u32 v8, s0, v6
; GFX8-NEXT: v_mul_hi_u32 v10, s0, v7
; GFX8-NEXT: v_mul_lo_u32 v9, s0, v7
-; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v10
-; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2
-; GFX8-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NEXT: v_mul_lo_u32 v10, v6, v9
; GFX8-NEXT: v_mul_lo_u32 v11, v7, v8
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s2, v3
@@ -1683,14 +1638,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: s_ashr_i32 s2, s9, 31
; GFX9-NEXT: s_ashr_i32 s6, s13, 31
; GFX9-NEXT: s_add_u32 s0, s8, s2
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
; GFX9-NEXT: s_addc_u32 s1, s9, s2
; GFX9-NEXT: s_add_u32 s8, s12, s6
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: s_and_b32 s3, s3, 1
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-NEXT: s_mov_b32 s7, s6
; GFX9-NEXT: s_addc_u32 s9, s13, s6
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7]
@@ -1702,8 +1651,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_sub_u32 s0, 0, s8
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
+; GFX9-NEXT: s_subb_u32 s1, 0, s9
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
@@ -1711,27 +1659,24 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
-; GFX9-NEXT: s_subb_u32 s1, 0, s9
; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1
; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0
; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0
; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, s13
; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4
; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5
-; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2
-; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5
+; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2
+; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, v1, v2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2
-; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -1745,6 +1690,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: v_mul_lo_u32 v3, s0, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0
; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4
; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2
@@ -1826,14 +1772,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: s_ashr_i32 s6, s11, 31
; GFX9-NEXT: s_ashr_i32 s8, s15, 31
; GFX9-NEXT: s_add_u32 s12, s10, s6
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: s_and_b32 s3, s3, 1
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-NEXT: s_addc_u32 s13, s11, s6
; GFX9-NEXT: s_add_u32 s10, s14, s8
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: s_and_b32 s3, s3, 1
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-NEXT: s_mov_b32 s9, s8
; GFX9-NEXT: s_addc_u32 s11, s15, s8
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
@@ -1858,14 +1798,11 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: v_add_f32_e32 v4, v6, v4
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT: s_cselect_b32 s14, 1, 0
-; GFX9-NEXT: s_and_b32 s14, s14, 1
-; GFX9-NEXT: s_cmp_lg_u32 s14, 0
; GFX9-NEXT: s_subb_u32 s14, 0, s11
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, s14, v4
; GFX9-NEXT: v_mul_lo_u32 v7, s3, v5
; GFX9-NEXT: v_mul_hi_u32 v8, s3, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX9-NEXT: v_mul_lo_u32 v9, s3, v4
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
@@ -2015,321 +1952,303 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX10-NEXT: s_ashr_i32 s2, s9, 31
; GFX10-NEXT: s_ashr_i32 s6, s13, 31
; GFX10-NEXT: s_add_u32 s0, s8, s2
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_mov_b32 s7, s6
-; GFX10-NEXT: s_and_b32 s1, s1, 1
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s1, s9, s2
; GFX10-NEXT: s_add_u32 s8, s12, s6
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
-; GFX10-NEXT: s_and_b32 s3, s3, 1
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
-; GFX10-NEXT: s_mov_b32 s3, s2
+; GFX10-NEXT: s_mov_b32 s7, s6
; GFX10-NEXT: s_addc_u32 s9, s13, s6
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_mov_b32 s3, s2
; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX10-NEXT: s_sub_u32 s20, 0, s8
-; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8
-; GFX10-NEXT: s_and_b32 s12, s12, 1
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_subb_u32 s21, 0, s9
; GFX10-NEXT: s_ashr_i32 s12, s11, 31
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX10-NEXT: s_xor_b64 s[18:19], s[2:3], s[6:7]
; GFX10-NEXT: s_ashr_i32 s16, s15, 31
-; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
; GFX10-NEXT: s_add_u32 s6, s10, s12
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
-; GFX10-NEXT: s_mov_b32 s17, s16
-; GFX10-NEXT: s_and_b32 s3, s3, 1
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
-; GFX10-NEXT: s_mov_b32 s13, s12
; GFX10-NEXT: s_addc_u32 s7, s11, s12
; GFX10-NEXT: s_add_u32 s10, s14, s16
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
-; GFX10-NEXT: s_and_b32 s3, s3, 1
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX10-NEXT: s_mov_b32 s17, s16
; GFX10-NEXT: s_addc_u32 s11, s15, s16
-; GFX10-NEXT: s_xor_b64 s[14:15], s[6:7], s[12:13]
+; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-NEXT: s_xor_b64 s[10:11], s[10:11], s[16:17]
+; GFX10-NEXT: s_mov_b32 s13, s12
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11
-; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
-; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s10
+; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s10
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT: s_xor_b64 s[14:15], s[6:7], s[12:13]
; GFX10-NEXT: s_sub_u32 s3, 0, s10
-; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX10-NEXT: s_subb_u32 s6, 0, s11
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
; GFX10-NEXT: v_trunc_f32_e32 v2, v2
-; GFX10-NEXT: s_and_b32 s6, s6, 1
-; GFX10-NEXT: s_cmp_lg_u32 s6, 0
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT: s_subb_u32 s6, 0, s11
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v1
; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
-; GFX10-NEXT: v_mul_lo_u32 v3, s20, v2
+; GFX10-NEXT: v_mul_lo_u32 v5, s20, v2
+; GFX10-NEXT: v_trunc_f32_e32 v3, v4
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GFX10-NEXT: v_mul_lo_u32 v4, s21, v0
-; GFX10-NEXT: v_mul_hi_u32 v5, s20, v0
-; GFX10-NEXT: v_mul_lo_u32 v6, s20, v0
-; GFX10-NEXT: v_mul_f32_e32 v7, 0x2f800000, v1
-; GFX10-NEXT: v_add3_u32 v3, v4, v3, v5
-; GFX10-NEXT: v_trunc_f32_e32 v4, v7
-; GFX10-NEXT: v_mul_lo_u32 v5, v2, v6
-; GFX10-NEXT: v_mul_hi_u32 v7, v0, v6
-; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6
-; GFX10-NEXT: v_mul_lo_u32 v8, v0, v3
-; GFX10-NEXT: v_mul_lo_u32 v10, v2, v3
-; GFX10-NEXT: v_mul_f32_e32 v9, 0xcf800000, v4
-; GFX10-NEXT: v_mul_hi_u32 v11, v0, v3
-; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3
-; GFX10-NEXT: v_add_f32_e32 v1, v9, v1
-; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v8
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7
-; GFX10-NEXT: v_add_co_u32 v6, s7, v10, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7
-; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v7
+; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3
+; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT: v_mul_lo_u32 v6, s21, v0
+; GFX10-NEXT: v_mul_hi_u32 v7, s20, v0
+; GFX10-NEXT: v_add_f32_e32 v1, v4, v1
+; GFX10-NEXT: v_mul_lo_u32 v4, s20, v0
+; GFX10-NEXT: v_mul_lo_u32 v8, s3, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s7
-; GFX10-NEXT: v_mul_lo_u32 v9, s3, v4
+; GFX10-NEXT: v_add3_u32 v5, v6, v5, v7
+; GFX10-NEXT: v_mul_lo_u32 v6, v2, v4
+; GFX10-NEXT: v_mul_lo_u32 v7, s6, v1
+; GFX10-NEXT: v_mul_hi_u32 v9, s3, v1
+; GFX10-NEXT: v_mul_lo_u32 v12, v0, v5
+; GFX10-NEXT: v_mul_hi_u32 v11, v0, v4
+; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4
+; GFX10-NEXT: v_mul_lo_u32 v13, v2, v5
+; GFX10-NEXT: v_mul_lo_u32 v10, s3, v1
+; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5
+; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v8, v9
+; GFX10-NEXT: v_add_co_u32 v6, s7, v6, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7
+; GFX10-NEXT: v_add_co_u32 v4, s7, v13, v4
+; GFX10-NEXT: v_mul_lo_u32 v8, v3, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7
+; GFX10-NEXT: v_mul_lo_u32 v15, v1, v7
; GFX10-NEXT: v_add_co_u32 v6, s7, v6, v11
-; GFX10-NEXT: v_mul_lo_u32 v12, s6, v1
-; GFX10-NEXT: v_mul_hi_u32 v13, s3, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s7
-; GFX10-NEXT: v_mul_lo_u32 v11, s3, v1
-; GFX10-NEXT: v_add_co_u32 v5, s7, v6, v5
-; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7
+; GFX10-NEXT: v_mul_hi_u32 v9, v1, v10
+; GFX10-NEXT: v_mul_hi_u32 v10, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7
-; GFX10-NEXT: v_add3_u32 v8, v12, v9, v13
-; GFX10-NEXT: v_mul_lo_u32 v9, v4, v11
-; GFX10-NEXT: v_mul_hi_u32 v10, v1, v11
-; GFX10-NEXT: v_mul_hi_u32 v11, v4, v11
-; GFX10-NEXT: v_add3_u32 v3, v7, v6, v3
-; GFX10-NEXT: v_mul_lo_u32 v6, v1, v8
-; GFX10-NEXT: v_mul_lo_u32 v7, v4, v8
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
-; GFX10-NEXT: v_mul_hi_u32 v5, v1, v8
-; GFX10-NEXT: v_mul_lo_u32 v12, s21, v0
-; GFX10-NEXT: v_add_co_u32 v6, s7, v9, v6
-; GFX10-NEXT: v_mul_hi_u32 v13, s20, v0
-; GFX10-NEXT: v_mul_lo_u32 v14, s20, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s7
-; GFX10-NEXT: v_add_co_u32 v7, s7, v7, v11
+; GFX10-NEXT: v_add_co_u32 v4, s7, v4, v14
+; GFX10-NEXT: v_mul_lo_u32 v14, v3, v7
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s7
-; GFX10-NEXT: v_add_co_u32 v6, s7, v6, v10
-; GFX10-NEXT: v_mul_lo_u32 v3, s20, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v6
+; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v15
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7
+; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11
+; GFX10-NEXT: v_mul_hi_u32 v16, v1, v7
+; GFX10-NEXT: v_add_co_u32 v10, s7, v14, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7
+; GFX10-NEXT: v_add_co_u32 v4, s7, v4, v6
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7
-; GFX10-NEXT: v_add_co_u32 v5, s7, v7, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s7
-; GFX10-NEXT: v_add3_u32 v12, v12, v14, v13
-; GFX10-NEXT: v_add_nc_u32_e32 v6, v9, v6
-; GFX10-NEXT: v_mul_hi_u32 v8, v4, v8
-; GFX10-NEXT: v_mul_lo_u32 v10, v2, v3
-; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7
-; GFX10-NEXT: v_mul_lo_u32 v11, v0, v12
-; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v6
-; GFX10-NEXT: v_mul_hi_u32 v9, v0, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7
-; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3
-; GFX10-NEXT: v_mul_lo_u32 v13, v2, v12
-; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v5
-; GFX10-NEXT: v_add_co_u32 v5, s7, v10, v11
-; GFX10-NEXT: v_add3_u32 v6, v7, v6, v8
-; GFX10-NEXT: v_mul_hi_u32 v14, v0, v12
-; GFX10-NEXT: v_mul_lo_u32 v10, s6, v1
-; GFX10-NEXT: v_add_co_u32 v5, s6, v5, v9
-; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s7
-; GFX10-NEXT: v_add_co_u32 v3, s7, v13, v3
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6
-; GFX10-NEXT: v_mul_hi_u32 v11, s3, v1
-; GFX10-NEXT: v_add_co_u32 v3, s6, v3, v14
-; GFX10-NEXT: v_mul_lo_u32 v13, s3, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v5
+; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v9
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s6
-; GFX10-NEXT: v_mul_hi_u32 v7, v2, v12
-; GFX10-NEXT: v_mul_lo_u32 v6, s3, v1
-; GFX10-NEXT: v_add_co_u32 v3, s3, v3, v5
-; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v9
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT: v_add3_u32 v9, v10, v13, v11
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
-; GFX10-NEXT: v_mul_lo_u32 v10, v4, v6
-; GFX10-NEXT: v_add3_u32 v5, v8, v5, v7
-; GFX10-NEXT: v_mul_lo_u32 v7, v1, v9
-; GFX10-NEXT: v_mul_hi_u32 v11, v1, v6
-; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6
-; GFX10-NEXT: v_mul_lo_u32 v8, v4, v9
+; GFX10-NEXT: v_add_co_u32 v9, s7, v10, v16
+; GFX10-NEXT: v_add3_u32 v5, v11, v6, v5
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
-; GFX10-NEXT: v_mul_hi_u32 v3, v1, v9
-; GFX10-NEXT: v_mul_lo_u32 v5, s1, v0
-; GFX10-NEXT: v_add_co_u32 v7, s3, v10, v7
-; GFX10-NEXT: v_mul_lo_u32 v13, s0, v2
+; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7
+; GFX10-NEXT: v_add_co_u32 v4, s7, v9, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v13, v10
+; GFX10-NEXT: v_mul_lo_u32 v5, s20, v0
+; GFX10-NEXT: v_mul_lo_u32 v9, s21, v0
+; GFX10-NEXT: v_mul_hi_u32 v10, s20, v0
+; GFX10-NEXT: v_mul_lo_u32 v11, s20, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7
+; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4
+; GFX10-NEXT: v_mul_hi_u32 v4, v2, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v8, v7
+; GFX10-NEXT: v_mul_lo_u32 v7, v2, v5
+; GFX10-NEXT: v_mul_hi_u32 v8, v0, v5
+; GFX10-NEXT: v_add3_u32 v5, v9, v11, v10
+; GFX10-NEXT: v_mul_lo_u32 v9, s6, v1
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX10-NEXT: v_mul_hi_u32 v10, s3, v1
+; GFX10-NEXT: v_mul_lo_u32 v12, v0, v5
+; GFX10-NEXT: v_mul_lo_u32 v13, v2, v5
+; GFX10-NEXT: v_mul_lo_u32 v11, s3, v3
+; GFX10-NEXT: v_mul_lo_u32 v6, s3, v1
+; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5
+; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5
+; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v12
+; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s3
-; GFX10-NEXT: v_add_co_u32 v6, s3, v8, v6
-; GFX10-NEXT: v_mul_hi_u32 v12, s0, v0
-; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0
-; GFX10-NEXT: v_mul_lo_u32 v14, s1, v2
+; GFX10-NEXT: v_add_co_u32 v4, s3, v13, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3
+; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3
+; GFX10-NEXT: v_add_co_u32 v4, s3, v4, v14
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3
-; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v11
+; GFX10-NEXT: v_mul_lo_u32 v15, v3, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7
+; GFX10-NEXT: v_mul_lo_u32 v12, v1, v9
+; GFX10-NEXT: v_mul_hi_u32 v16, v1, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v8, v11, v8
+; GFX10-NEXT: v_mul_hi_u32 v6, v3, v6
+; GFX10-NEXT: v_add_co_u32 v4, s3, v4, v7
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3
-; GFX10-NEXT: v_add_co_u32 v3, s3, v6, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3
-; GFX10-NEXT: v_add_co_u32 v5, s3, v5, v13
-; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3
-; GFX10-NEXT: v_add_co_u32 v0, s3, v14, v0
-; GFX10-NEXT: v_mul_hi_u32 v15, s0, v2
+; GFX10-NEXT: v_mul_lo_u32 v13, v3, v9
+; GFX10-NEXT: v_mul_hi_u32 v10, v1, v9
+; GFX10-NEXT: v_add_co_u32 v11, s3, v15, v12
+; GFX10-NEXT: v_add3_u32 v5, v8, v7, v5
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s3
+; GFX10-NEXT: v_add_co_u32 v6, s3, v13, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s3
-; GFX10-NEXT: v_add_co_u32 v5, s3, v5, v12
+; GFX10-NEXT: v_add_co_u32 v7, s3, v11, v16
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3
+; GFX10-NEXT: v_add_co_u32 v4, s3, v6, v10
+; GFX10-NEXT: v_mul_lo_u32 v6, s1, v0
+; GFX10-NEXT: v_mul_lo_u32 v8, s0, v2
+; GFX10-NEXT: v_mul_hi_u32 v10, s1, v0
+; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX10-NEXT: v_mul_lo_u32 v11, s1, v2
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
+; GFX10-NEXT: v_add_nc_u32_e32 v7, v12, v7
+; GFX10-NEXT: v_mul_hi_u32 v12, s0, v2
+; GFX10-NEXT: v_mul_hi_u32 v9, v3, v9
+; GFX10-NEXT: v_add_co_u32 v6, s3, v6, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3
+; GFX10-NEXT: v_add_co_u32 v10, s3, v11, v10
+; GFX10-NEXT: v_add_co_u32 v0, s6, v6, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3
+; GFX10-NEXT: v_add_co_u32 v10, s3, v10, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v0
+; GFX10-NEXT: v_add_co_u32 v4, s3, v4, v7
; GFX10-NEXT: v_mul_hi_u32 v2, s1, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7
-; GFX10-NEXT: v_add_nc_u32_e32 v6, v8, v6
-; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v15
-; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s3
-; GFX10-NEXT: v_mul_hi_u32 v9, v4, v9
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v5
-; GFX10-NEXT: v_add_nc_u32_e32 v8, v13, v12
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT: v_add_co_u32 v3, s3, v3, v7
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3
-; GFX10-NEXT: v_add3_u32 v2, v8, v5, v2
-; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
-; GFX10-NEXT: v_add3_u32 v5, v6, v7, v9
-; GFX10-NEXT: v_mul_lo_u32 v6, s9, v0
-; GFX10-NEXT: v_mul_hi_u32 v7, s8, v0
-; GFX10-NEXT: v_mul_lo_u32 v9, s8, v2
-; GFX10-NEXT: v_mul_lo_u32 v3, s8, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
-; GFX10-NEXT: v_mul_lo_u32 v5, s15, v1
-; GFX10-NEXT: v_mul_hi_u32 v10, s15, v1
-; GFX10-NEXT: v_mul_hi_u32 v1, s14, v1
-; GFX10-NEXT: v_mul_hi_u32 v17, s14, v4
-; GFX10-NEXT: v_add3_u32 v6, v6, v9, v7
-; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, s0, v3
-; GFX10-NEXT: v_mul_lo_u32 v7, s14, v4
-; GFX10-NEXT: v_mul_lo_u32 v9, s15, v4
-; GFX10-NEXT: v_sub_nc_u32_e32 v11, s1, v6
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v6, s0, s1, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v3
-; GFX10-NEXT: v_mul_hi_u32 v4, s15, v4
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, v3, s8
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v6
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, v12, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v15
-; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0
-; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v7
+; GFX10-NEXT: v_add_co_u32 v0, s3, v10, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v5, v13, v5
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v11
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3
+; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v7, v9
+; GFX10-NEXT: v_mul_lo_u32 v4, s9, v0
+; GFX10-NEXT: v_add3_u32 v2, v6, v8, v2
+; GFX10-NEXT: v_mul_lo_u32 v7, s15, v1
+; GFX10-NEXT: v_mul_lo_u32 v8, s8, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX10-NEXT: v_mul_hi_u32 v5, s8, v0
+; GFX10-NEXT: v_mul_lo_u32 v6, s8, v2
+; GFX10-NEXT: v_mul_hi_u32 v9, s14, v1
+; GFX10-NEXT: v_mul_lo_u32 v11, s14, v3
+; GFX10-NEXT: v_mul_hi_u32 v1, s15, v1
+; GFX10-NEXT: v_mul_lo_u32 v12, s15, v3
+; GFX10-NEXT: v_mul_hi_u32 v13, s14, v3
+; GFX10-NEXT: v_mul_hi_u32 v3, s15, v3
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT: v_add3_u32 v4, v4, v6, v5
+; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s0, v8
+; GFX10-NEXT: v_add_co_u32 v6, s0, v7, v11
+; GFX10-NEXT: v_sub_nc_u32_e32 v8, s1, v4
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v10
-; GFX10-NEXT: v_add_co_u32 v1, s1, v5, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v17
-; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v17, s0, v0, 1
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v4, s0, s1, v4, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v5
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v4
+; GFX10-NEXT: v_add_co_u32 v6, s1, v6, v9
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v5, s8
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v7, v6
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v10, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v14, v11, s0
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v15
+; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v16
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0
+; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13
+; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v13, s0, v0, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s0, 0, v2, s0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v7, v1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v15
-; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v10
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v13, s0
-; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v16
+; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v17, v14, s0
+; GFX10-NEXT: v_add_co_u32 v6, s0, v1, v6
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v10, s0, v17, 1
-; GFX10-NEXT: v_add_co_ci_u32_e64 v13, s0, 0, v18, s0
+; GFX10-NEXT: v_add_co_u32 v12, s0, v13, 1
+; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v18, s0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7
-; GFX10-NEXT: v_add3_u32 v4, v5, v1, v4
-; GFX10-NEXT: v_sub_co_u32 v1, s0, v14, s8
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v11, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v18, v13, vcc_lo
-; GFX10-NEXT: v_mul_lo_u32 v13, s11, v9
-; GFX10-NEXT: v_mul_lo_u32 v16, s10, v4
-; GFX10-NEXT: v_mul_hi_u32 v17, s10, v9
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT: v_add3_u32 v3, v9, v1, v3
+; GFX10-NEXT: v_sub_co_u32 v1, s0, v15, s8
+; GFX10-NEXT: v_mul_hi_u32 v17, s10, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc_lo
+; GFX10-NEXT: v_mul_lo_u32 v13, s11, v6
+; GFX10-NEXT: v_mul_lo_u32 v14, s10, v3
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7
-; GFX10-NEXT: v_mul_lo_u32 v7, s10, v9
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v14, v1, s0
-; GFX10-NEXT: v_add3_u32 v10, v13, v16, v17
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v5, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
+; GFX10-NEXT: v_mul_lo_u32 v7, s10, v6
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v13, v14, v17
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v8, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v7, s0, s14, v7
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s1, s15, v10, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, s15, v10
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s1, s15, v9, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, s15, v9
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v11
; GFX10-NEXT: v_xor_b32_e32 v0, s18, v0
; GFX10-NEXT: v_xor_b32_e32 v2, s19, v2
-; GFX10-NEXT: v_xor_b32_e32 v5, s2, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, vcc_lo, s11, v1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, vcc_lo, s11, v1, s0
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v7
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v7, s10
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v10, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v9, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s18
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v2, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v11
-; GFX10-NEXT: v_xor_b32_e32 v2, s2, v3
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, s11, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v12, s0
+; GFX10-NEXT: v_xor_b32_e32 v2, s2, v5
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s11, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v12, s0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v13
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT: v_add_co_u32 v15, s0, v9, 1
-; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v4, s0
+; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1
+; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v3, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0
; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s10
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_sub_co_u32 v8, s0, v13, s10
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0
; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v5
; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v4, v15, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v11, v6, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v14, v9, vcc_lo
+; GFX10-NEXT: v_xor_b32_e32 v9, s2, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v5, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v8, s0
; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17]
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s2
-; GFX10-NEXT: v_xor_b32_e32 v2, s0, v9
-; GFX10-NEXT: v_xor_b32_e32 v7, s1, v10
-; GFX10-NEXT: v_xor_b32_e32 v9, s12, v3
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v5, vcc_lo
-; GFX10-NEXT: v_xor_b32_e32 v10, s12, v6
+; GFX10-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX10-NEXT: v_xor_b32_e32 v3, s1, v3
+; GFX10-NEXT: v_xor_b32_e32 v6, s12, v7
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v9, vcc_lo
+; GFX10-NEXT: v_xor_b32_e32 v7, s12, v8
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v9, s12
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v10, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, s12
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
-; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5]
+; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
; GFX10-NEXT: s_endpgm
%div = sdiv <2 x i64> %x, %y
store <2 x i64> %div, <2 x i64> addrspace(1)* %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 1e95103fd61cb..0cddf3e2c86ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -204,14 +204,8 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
; CHECK-NEXT: s_ashr_i32 s0, s5, 31
; CHECK-NEXT: s_add_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b32 s7, 1, 0
-; CHECK-NEXT: s_and_b32 s7, s7, 1
-; CHECK-NEXT: s_cmp_lg_u32 s7, 0
; CHECK-NEXT: s_addc_u32 s11, s3, s6
; CHECK-NEXT: s_add_u32 s8, s4, s0
-; CHECK-NEXT: s_cselect_b32 s3, 1, 0
-; CHECK-NEXT: s_and_b32 s3, s3, 1
-; CHECK-NEXT: s_cmp_lg_u32 s3, 0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_addc_u32 s9, s5, s0
; CHECK-NEXT: s_xor_b64 s[8:9], s[8:9], s[0:1]
@@ -222,21 +216,18 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_sub_u32 s0, 0, s8
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT: s_cselect_b32 s1, 1, 0
-; CHECK-NEXT: s_and_b32 s1, s1, 1
-; CHECK-NEXT: s_cmp_lg_u32 s1, 0
+; CHECK-NEXT: s_subb_u32 s1, 0, s9
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v1, v1
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: s_subb_u32 s1, 0, s9
-; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1
-; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, s0, v1
+; CHECK-NEXT: v_mul_lo_u32 v3, s1, v0
; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0
; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CHECK-NEXT: v_mul_lo_u32 v3, v1, v4
; CHECK-NEXT: v_mul_lo_u32 v5, v0, v2
@@ -1174,43 +1165,38 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s10, 0x1000
-; GISEL-NEXT: s_add_u32 s4, s10, 0
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
; GISEL-NEXT: s_mov_b32 s6, 0
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9
; GISEL-NEXT: s_sub_u32 s4, 0, s8
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GISEL-NEXT: s_subb_u32 s5, 0, s9
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7
; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
@@ -1234,7 +1220,6 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
@@ -1303,16 +1288,13 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v8
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v7
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT: s_add_u32 s4, s10, 0
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7
-; GISEL-NEXT: s_and_b32 s5, s5, 1
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
@@ -1323,26 +1305,23 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7
; GISEL-NEXT: s_sub_u32 s4, 0, s6
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GISEL-NEXT: s_subb_u32 s5, 0, s7
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -1882,43 +1861,38 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb
-; GISEL-NEXT: s_add_u32 s4, s10, 0
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
; GISEL-NEXT: s_mov_b32 s6, 0
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9
; GISEL-NEXT: s_sub_u32 s4, 0, s8
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GISEL-NEXT: s_subb_u32 s5, 0, s9
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7
; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
@@ -1942,7 +1916,6 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
@@ -2011,16 +1984,13 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v8
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v7
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT: s_add_u32 s4, s10, 0
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7
-; GISEL-NEXT: s_and_b32 s5, s5, 1
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
+; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
@@ -2031,26 +2001,23 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7
; GISEL-NEXT: s_sub_u32 s4, 0, s6
-; GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GISEL-NEXT: s_and_b32 s5, s5, 1
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GISEL-NEXT: s_subb_u32 s5, 0, s7
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index f3509f3e80694..38599949d7777 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4203,9 +4203,6 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-LABEL: s_ssubsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s4, s0, s2
-; GFX6-NEXT: s_cselect_b32 s5, 1, 0
-; GFX6-NEXT: s_and_b32 s5, s5, 1
-; GFX6-NEXT: s_cmp_lg_u32 s5, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_subb_u32 s5, s1, s3
; GFX6-NEXT: v_mov_b32_e32 v1, s1
@@ -4229,9 +4226,6 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-LABEL: s_ssubsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s4, s0, s2
-; GFX8-NEXT: s_cselect_b32 s5, 1, 0
-; GFX8-NEXT: s_and_b32 s5, s5, 1
-; GFX8-NEXT: s_cmp_lg_u32 s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_subb_u32 s5, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -4255,9 +4249,6 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-LABEL: s_ssubsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s4, s0, s2
-; GFX9-NEXT: s_cselect_b32 s5, 1, 0
-; GFX9-NEXT: s_and_b32 s5, s5, 1
-; GFX9-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_subb_u32 s5, s1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -4281,15 +4272,12 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX10-LABEL: s_ssubsat_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s4, s0, s2
-; GFX10-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0
-; GFX10-NEXT: s_and_b32 s5, s5, 1
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_subb_u32 s5, s1, s3
-; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1]
+; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_xor_b32 s2, s2, s1
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
@@ -4545,9 +4533,6 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-LABEL: s_ssubsat_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s8, s0, s4
-; GFX6-NEXT: s_cselect_b32 s9, 1, 0
-; GFX6-NEXT: s_and_b32 s9, s9, 1
-; GFX6-NEXT: s_cmp_lg_u32 s9, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_subb_u32 s9, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s1
@@ -4558,16 +4543,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX6-NEXT: s_brev_b32 s5, 1
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_addc_u32 s1, s4, s5
-; GFX6-NEXT: s_sub_u32 s0, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: s_cselect_b32 s1, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: s_and_b32 s1, s1, 1
+; GFX6-NEXT: s_addc_u32 s1, s4, s5
; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX6-NEXT: s_cmp_lg_u32 s1, 0
+; GFX6-NEXT: s_sub_u32 s0, s2, s6
; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: v_mov_b32_e32 v3, s9
; GFX6-NEXT: s_subb_u32 s1, s3, s7
; GFX6-NEXT: v_mov_b32_e32 v1, s3
@@ -4594,9 +4576,6 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-LABEL: s_ssubsat_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s8, s0, s4
-; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: s_and_b32 s9, s9, 1
-; GFX8-NEXT: s_cmp_lg_u32 s9, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_subb_u32 s9, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -4607,16 +4586,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX8-NEXT: s_brev_b32 s5, 1
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
-; GFX8-NEXT: s_addc_u32 s1, s4, s5
-; GFX8-NEXT: s_sub_u32 s0, s2, s6
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s8
-; GFX8-NEXT: s_and_b32 s1, s1, 1
+; GFX8-NEXT: s_addc_u32 s1, s4, s5
; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
+; GFX8-NEXT: s_sub_u32 s0, s2, s6
; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: s_subb_u32 s1, s3, s7
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -4643,9 +4619,6 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-LABEL: s_ssubsat_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s8, s0, s4
-; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: s_and_b32 s9, s9, 1
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_subb_u32 s9, s1, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -4656,16 +4629,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: s_brev_b32 s5, 1
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
-; GFX9-NEXT: s_addc_u32 s1, s4, s5
-; GFX9-NEXT: s_sub_u32 s0, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-NEXT: s_and_b32 s1, s1, 1
+; GFX9-NEXT: s_addc_u32 s1, s4, s5
; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
+; GFX9-NEXT: s_sub_u32 s0, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: s_subb_u32 s1, s3, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s3
@@ -4692,32 +4662,26 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-LABEL: s_ssubsat_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s8, s0, s4
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
+; GFX10-NEXT: s_subb_u32 s9, s1, s5
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0
-; GFX10-NEXT: s_and_b32 s9, s9, 1
+; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
; GFX10-NEXT: s_mov_b32 s11, 0
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
+; GFX10-NEXT: s_ashr_i32 s0, s9, 31
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: s_subb_u32 s9, s1, s5
; GFX10-NEXT: s_brev_b32 s10, 1
-; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
-; GFX10-NEXT: s_ashr_i32 s0, s9, 31
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_xor_b32 s8, s4, s1
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8
; GFX10-NEXT: s_addc_u32 s1, s0, s10
; GFX10-NEXT: s_sub_u32 s4, s2, s6
-; GFX10-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: s_and_b32 s5, s5, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8
-; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_subb_u32 s5, s3, s7
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0
; GFX10-NEXT: s_ashr_i32 s0, s5, 31
; GFX10-NEXT: v_mov_b32_e32 v3, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8
; GFX10-NEXT: s_xor_b32 s2, s3, s2
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2
@@ -4736,19 +4700,10 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-LABEL: s_ssubsat_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s8, s0, s4
-; GFX6-NEXT: s_cselect_b32 s9, 1, 0
-; GFX6-NEXT: s_and_b32 s9, s9, 1
-; GFX6-NEXT: s_cmp_lg_u32 s9, 0
-; GFX6-NEXT: s_subb_u32 s9, s1, s5
-; GFX6-NEXT: s_cselect_b32 s10, 1, 0
-; GFX6-NEXT: s_and_b32 s10, s10, 1
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s10, s2, s6
-; GFX6-NEXT: s_cselect_b32 s11, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: s_and_b32 s11, s11, 1
+; GFX6-NEXT: s_subb_u32 s9, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_cmp_lg_u32 s11, 0
+; GFX6-NEXT: s_subb_u32 s10, s2, s6
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
; GFX6-NEXT: s_subb_u32 s11, s3, s7
@@ -4761,21 +4716,15 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX6-NEXT: s_ashr_i32 s0, s11, 31
; GFX6-NEXT: s_mov_b32 s1, 0
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
+; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_addc_u32 s1, s0, 0
-; GFX6-NEXT: s_cselect_b32 s2, 1, 0
-; GFX6-NEXT: s_and_b32 s2, s2, 1
-; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s2, s0, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT: s_cselect_b32 s3, 1, 0
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT: s_and_b32 s3, s3, 1
-; GFX6-NEXT: s_cmp_lg_u32 s3, 0
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
@@ -4800,18 +4749,9 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-LABEL: s_ssubsat_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s8, s0, s4
-; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: s_and_b32 s9, s9, 1
-; GFX8-NEXT: s_cmp_lg_u32 s9, 0
; GFX8-NEXT: s_subb_u32 s9, s1, s5
-; GFX8-NEXT: s_cselect_b32 s10, 1, 0
-; GFX8-NEXT: s_and_b32 s10, s10, 1
-; GFX8-NEXT: s_cmp_lg_u32 s10, 0
-; GFX8-NEXT: s_subb_u32 s10, s2, s6
-; GFX8-NEXT: s_cselect_b32 s11, 1, 0
-; GFX8-NEXT: s_and_b32 s11, s11, 1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_cmp_lg_u32 s11, 0
+; GFX8-NEXT: s_subb_u32 s10, s2, s6
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_subb_u32 s11, s3, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s2
@@ -4835,17 +4775,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX8-NEXT: s_ashr_i32 s0, s11, 31
; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
+; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_addc_u32 s1, s0, 0
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
-; GFX8-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-NEXT: s_addc_u32 s2, s0, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_and_b32 s3, s3, 1
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
@@ -4870,18 +4804,9 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-LABEL: s_ssubsat_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s8, s0, s4
-; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: s_and_b32 s9, s9, 1
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: s_subb_u32 s9, s1, s5
-; GFX9-NEXT: s_cselect_b32 s10, 1, 0
-; GFX9-NEXT: s_and_b32 s10, s10, 1
-; GFX9-NEXT: s_cmp_lg_u32 s10, 0
-; GFX9-NEXT: s_subb_u32 s10, s2, s6
-; GFX9-NEXT: s_cselect_b32 s11, 1, 0
-; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_cmp_lg_u32 s11, 0
+; GFX9-NEXT: s_subb_u32 s10, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_subb_u32 s11, s3, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -4905,17 +4830,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX9-NEXT: s_ashr_i32 s0, s11, 31
; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
+; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_addc_u32 s1, s0, 0
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_addc_u32 s2, s0, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: s_and_b32 s3, s3, 1
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
@@ -4940,62 +4859,47 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-LABEL: s_ssubsat_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s8, s0, s4
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: s_and_b32 s9, s9, 1
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_subb_u32 s9, s1, s5
-; GFX10-NEXT: s_cselect_b32 s10, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT: s_and_b32 s10, s10, 1
-; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: s_subb_u32 s10, s2, s6
-; GFX10-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s11, s11, 1
-; GFX10-NEXT: s_cmp_lg_u32 s11, 0
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
; GFX10-NEXT: s_subb_u32 s11, s3, s7
-; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
-; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
-; GFX10-NEXT: s_cselect_b32 s0, 1, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s11
-; GFX10-NEXT: s_and_b32 s0, 1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[4:5], 0
+; GFX10-NEXT: s_cselect_b32 s12, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
+; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: s_and_b32 s0, 1, s12
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: s_and_b32 s1, 1, s1
; GFX10-NEXT: s_ashr_i32 s0, s11, 31
+; GFX10-NEXT: s_and_b32 s1, 1, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
; GFX10-NEXT: s_mov_b32 s1, 0
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT: s_addc_u32 s1, s0, 0
-; GFX10-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s9
-; GFX10-NEXT: s_and_b32 s2, s2, 1
+; GFX10-NEXT: v_mov_b32_e32 v3, s11
+; GFX10-NEXT: s_addc_u32 s1, s0, 0
+; GFX10-NEXT: s_addc_u32 s2, s0, 0
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s8
-; GFX10-NEXT: s_addc_u32 s2, s0, 0
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: s_and_b32 s3, s3, 1
-; GFX10-NEXT: v_mov_b32_e32 v3, s10
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s1, v2
+; GFX10-NEXT: v_readfirstlane_b32 s2, v0
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
%result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
@@ -5553,19 +5457,10 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-LABEL: s_ssubsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s16, s0, s8
-; GFX6-NEXT: s_cselect_b32 s17, 1, 0
-; GFX6-NEXT: s_and_b32 s17, s17, 1
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_subb_u32 s17, s1, s9
-; GFX6-NEXT: s_cselect_b32 s18, 1, 0
-; GFX6-NEXT: s_and_b32 s18, s18, 1
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s18, s2, s10
-; GFX6-NEXT: s_cselect_b32 s19, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: s_and_b32 s19, s19, 1
+; GFX6-NEXT: s_subb_u32 s17, s1, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_subb_u32 s18, s2, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
; GFX6-NEXT: s_subb_u32 s19, s3, s11
@@ -5578,51 +5473,36 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX6-NEXT: s_ashr_i32 s0, s19, 31
; GFX6-NEXT: s_mov_b32 s1, 0
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
+; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_addc_u32 s1, s0, 0
-; GFX6-NEXT: s_cselect_b32 s2, 1, 0
-; GFX6-NEXT: s_and_b32 s2, s2, 1
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s2, s0, 0
-; GFX6-NEXT: s_cselect_b32 s3, 1, 0
-; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0
-; GFX6-NEXT: s_and_b32 s3, s3, 1
; GFX6-NEXT: s_brev_b32 s8, 1
-; GFX6-NEXT: s_cmp_lg_u32 s3, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT: s_addc_u32 s2, s0, 0
+; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: s_addc_u32 s3, s0, s8
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: s_sub_u32 s0, s4, s12
; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: s_cselect_b32 s1, 1, 0
-; GFX6-NEXT: s_and_b32 s1, s1, 1
-; GFX6-NEXT: s_cmp_lg_u32 s1, 0
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_subb_u32 s1, s5, s13
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: s_cselect_b32 s2, 1, 0
-; GFX6-NEXT: s_and_b32 s2, s2, 1
; GFX6-NEXT: v_mov_b32_e32 v3, s16
; GFX6-NEXT: v_mov_b32_e32 v4, s17
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: v_mov_b32_e32 v2, s18
; GFX6-NEXT: v_mov_b32_e32 v3, s19
-; GFX6-NEXT: s_subb_u32 s2, s6, s14
; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX6-NEXT: s_cselect_b32 s3, 1, 0
+; GFX6-NEXT: s_sub_u32 s0, s4, s12
; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: s_and_b32 s3, s3, 1
+; GFX6-NEXT: s_subb_u32 s1, s5, s13
; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: s_cmp_lg_u32 s3, 0
+; GFX6-NEXT: s_subb_u32 s2, s6, s14
; GFX6-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX6-NEXT: s_subb_u32 s3, s7, s15
@@ -5635,21 +5515,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
; GFX6-NEXT: s_mov_b32 s5, 0
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX6-NEXT: s_cmp_lg_u32 s5, 0
+; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_addc_u32 s5, s4, 0
-; GFX6-NEXT: s_cselect_b32 s6, 1, 0
-; GFX6-NEXT: s_and_b32 s6, s6, 1
-; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s6, s4, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT: s_cselect_b32 s7, 1, 0
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT: s_and_b32 s7, s7, 1
-; GFX6-NEXT: s_cmp_lg_u32 s7, 0
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: s_addc_u32 s7, s4, s8
; GFX6-NEXT: v_mov_b32_e32 v1, s4
@@ -5678,18 +5552,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-LABEL: s_ssubsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s16, s0, s8
-; GFX8-NEXT: s_cselect_b32 s17, 1, 0
-; GFX8-NEXT: s_and_b32 s17, s17, 1
-; GFX8-NEXT: s_cmp_lg_u32 s17, 0
; GFX8-NEXT: s_subb_u32 s17, s1, s9
-; GFX8-NEXT: s_cselect_b32 s18, 1, 0
-; GFX8-NEXT: s_and_b32 s18, s18, 1
-; GFX8-NEXT: s_cmp_lg_u32 s18, 0
-; GFX8-NEXT: s_subb_u32 s18, s2, s10
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_cmp_lg_u32 s19, 0
+; GFX8-NEXT: s_subb_u32 s18, s2, s10
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_subb_u32 s19, s3, s11
; GFX8-NEXT: v_mov_b32_e32 v0, s2
@@ -5713,46 +5578,31 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX8-NEXT: s_ashr_i32 s0, s19, 31
; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
+; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_addc_u32 s1, s0, 0
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
-; GFX8-NEXT: s_cmp_lg_u32 s2, 0
-; GFX8-NEXT: s_addc_u32 s2, s0, 0
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
-; GFX8-NEXT: s_and_b32 s3, s3, 1
; GFX8-NEXT: s_brev_b32 s8, 1
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT: s_addc_u32 s2, s0, 0
+; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_addc_u32 s3, s0, s8
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: s_sub_u32 s0, s4, s12
; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_subb_u32 s1, s5, s13
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
-; GFX8-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s16
; GFX8-NEXT: v_mov_b32_e32 v4, s17
-; GFX8-NEXT: s_subb_u32 s2, s6, s14
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s18
; GFX8-NEXT: v_mov_b32_e32 v3, s19
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
+; GFX8-NEXT: s_sub_u32 s0, s4, s12
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX8-NEXT: s_and_b32 s3, s3, 1
+; GFX8-NEXT: s_subb_u32 s1, s5, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
+; GFX8-NEXT: s_subb_u32 s2, s6, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_subb_u32 s3, s7, s15
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -5776,17 +5626,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX8-NEXT: s_ashr_i32 s4, s3, 31
; GFX8-NEXT: s_mov_b32 s5, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
+; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_addc_u32 s5, s4, 0
-; GFX8-NEXT: s_cselect_b32 s6, 1, 0
-; GFX8-NEXT: s_and_b32 s6, s6, 1
-; GFX8-NEXT: s_cmp_lg_u32 s6, 0
; GFX8-NEXT: s_addc_u32 s6, s4, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: s_cselect_b32 s7, 1, 0
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_and_b32 s7, s7, 1
-; GFX8-NEXT: s_cmp_lg_u32 s7, 0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_addc_u32 s7, s4, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s4
@@ -5815,18 +5659,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-LABEL: s_ssubsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s16, s0, s8
-; GFX9-NEXT: s_cselect_b32 s17, 1, 0
-; GFX9-NEXT: s_and_b32 s17, s17, 1
-; GFX9-NEXT: s_cmp_lg_u32 s17, 0
; GFX9-NEXT: s_subb_u32 s17, s1, s9
-; GFX9-NEXT: s_cselect_b32 s18, 1, 0
-; GFX9-NEXT: s_and_b32 s18, s18, 1
-; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_subb_u32 s18, s2, s10
-; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_subb_u32 s18, s2, s10
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_subb_u32 s19, s3, s11
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -5850,46 +5685,31 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX9-NEXT: s_ashr_i32 s0, s19, 31
; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
+; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_addc_u32 s1, s0, 0
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_addc_u32 s2, s0, 0
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: s_and_b32 s3, s3, 1
; GFX9-NEXT: s_brev_b32 s8, 1
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_addc_u32 s2, s0, 0
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_addc_u32 s3, s0, s8
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: s_sub_u32 s0, s4, s12
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_subb_u32 s1, s5, s13
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s16
; GFX9-NEXT: v_mov_b32_e32 v4, s17
-; GFX9-NEXT: s_subb_u32 s2, s6, s14
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-NEXT: s_sub_u32 s0, s4, s12
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
-; GFX9-NEXT: s_and_b32 s3, s3, 1
+; GFX9-NEXT: s_subb_u32 s1, s5, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
+; GFX9-NEXT: s_subb_u32 s2, s6, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: s_subb_u32 s3, s7, s15
; GFX9-NEXT: v_mov_b32_e32 v0, s6
@@ -5913,17 +5733,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_mov_b32 s5, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
+; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_addc_u32 s5, s4, 0
-; GFX9-NEXT: s_cselect_b32 s6, 1, 0
-; GFX9-NEXT: s_and_b32 s6, s6, 1
-; GFX9-NEXT: s_cmp_lg_u32 s6, 0
; GFX9-NEXT: s_addc_u32 s6, s4, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: s_cselect_b32 s7, 1, 0
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: s_and_b32 s7, s7, 1
-; GFX9-NEXT: s_cmp_lg_u32 s7, 0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_addc_u32 s7, s4, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -5952,120 +5766,90 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-LABEL: s_ssubsat_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s16, s0, s8
-; GFX10-NEXT: s_cselect_b32 s17, 1, 0
-; GFX10-NEXT: s_and_b32 s17, s17, 1
-; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_subb_u32 s17, s1, s9
-; GFX10-NEXT: s_cselect_b32 s18, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
-; GFX10-NEXT: s_and_b32 s18, s18, 1
-; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_subb_u32 s18, s2, s10
-; GFX10-NEXT: s_cselect_b32 s19, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s19, s19, 1
-; GFX10-NEXT: s_cmp_lg_u32 s19, 0
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
; GFX10-NEXT: s_subb_u32 s19, s3, s11
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
+; GFX10-NEXT: s_brev_b32 s21, 1
; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
-; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
+; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: s_and_b32 s0, 1, s20
; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_mov_b32 s10, 0
+; GFX10-NEXT: s_mov_b32 s20, 0
; GFX10-NEXT: s_and_b32 s1, 1, s1
; GFX10-NEXT: s_ashr_i32 s0, s19, 31
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT: s_cmp_lg_u32 s20, 0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT: s_cmp_lg_u32 s10, 0
-; GFX10-NEXT: s_brev_b32 s11, 1
; GFX10-NEXT: s_addc_u32 s1, s0, 0
-; GFX10-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT: s_addc_u32 s2, s0, 0
+; GFX10-NEXT: s_addc_u32 s3, s0, s21
+; GFX10-NEXT: s_sub_u32 s8, s4, s12
+; GFX10-NEXT: s_subb_u32 s9, s5, s13
+; GFX10-NEXT: s_subb_u32 s10, s6, s14
+; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5]
+; GFX10-NEXT: s_subb_u32 s11, s7, s15
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT: s_and_b32 s2, s2, 1
+; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[6:7]
; GFX10-NEXT: v_mov_b32_e32 v2, s17
-; GFX10-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, s19
+; GFX10-NEXT: v_mov_b32_e32 v7, s11
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[10:11], s[6:7]
+; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT: s_addc_u32 s2, s0, 0
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s16
-; GFX10-NEXT: s_and_b32 s3, s3, 1
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
-; GFX10-NEXT: s_addc_u32 s3, s0, s11
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX10-NEXT: s_sub_u32 s0, s4, s12
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
-; GFX10-NEXT: s_and_b32 s1, s1, 1
-; GFX10-NEXT: v_mov_b32_e32 v2, s18
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
-; GFX10-NEXT: s_subb_u32 s1, s5, s13
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: v_cmp_gt_u64_e64 s4, s[12:13], 0
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT: s_subb_u32 s8, s6, s14
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
-; GFX10-NEXT: s_and_b32 s9, s9, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
-; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[14:15], 0
-; GFX10-NEXT: s_subb_u32 s9, s7, s15
-; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
-; GFX10-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
-; GFX10-NEXT: s_and_b32 s2, 1, s2
+; GFX10-NEXT: s_cselect_b32 s16, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10-NEXT: s_and_b32 s4, 1, s16
; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
-; GFX10-NEXT: s_ashr_i32 s2, s9, 31
-; GFX10-NEXT: s_and_b32 s3, 1, s3
-; GFX10-NEXT: s_cmp_lg_u32 s10, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3
-; GFX10-NEXT: s_addc_u32 s3, s2, 0
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
-; GFX10-NEXT: v_mov_b32_e32 v8, s9
-; GFX10-NEXT: s_and_b32 s4, s4, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc_lo
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-NEXT: v_mov_b32_e32 v6, s1
-; GFX10-NEXT: s_addc_u32 s4, s2, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4
-; GFX10-NEXT: s_and_b32 s5, s5, 1
-; GFX10-NEXT: v_mov_b32_e32 v5, s0
-; GFX10-NEXT: v_mov_b32_e32 v7, s8
-; GFX10-NEXT: s_cmp_lg_u32 s5, 0
-; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT: s_addc_u32 s1, s2, s11
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: v_readfirstlane_b32 s4, v4
-; GFX10-NEXT: v_readfirstlane_b32 s5, v5
-; GFX10-NEXT: v_readfirstlane_b32 s6, v6
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: s_and_b32 s5, 1, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s18
+; GFX10-NEXT: v_mov_b32_e32 v5, s19
+; GFX10-NEXT: v_mov_b32_e32 v6, s9
+; GFX10-NEXT: v_xor_b32_e32 v3, v4, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
+; GFX10-NEXT: s_ashr_i32 s0, s11, 31
+; GFX10-NEXT: s_cmp_lg_u32 s20, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v5, s8
+; GFX10-NEXT: s_addc_u32 s1, s0, 0
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, s10
+; GFX10-NEXT: s_addc_u32 s2, s0, 0
+; GFX10-NEXT: s_addc_u32 s3, s0, s21
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s1, v2
+; GFX10-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-NEXT: v_readfirstlane_b32 s3, v4
+; GFX10-NEXT: v_readfirstlane_b32 s4, v5
+; GFX10-NEXT: v_readfirstlane_b32 s5, v6
+; GFX10-NEXT: v_readfirstlane_b32 s6, v3
; GFX10-NEXT: v_readfirstlane_b32 s7, v7
; GFX10-NEXT: ; return to shader part epilog
%result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index ea9547e063025..ec59cb495898d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -457,7 +457,6 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_sub_u32 s0, s0, s1
; GFX7-NEXT: s_cselect_b32 s1, 1, 0
-; GFX7-NEXT: s_and_b32 s1, s1, 1
; GFX7-NEXT: s_sub_i32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
@@ -465,7 +464,6 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s0, s0, s1
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
; GFX8-NEXT: s_sub_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
@@ -473,7 +471,6 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s0, s0, s1
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
; GFX9-NEXT: s_sub_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
%usubo = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
@@ -487,13 +484,10 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_usubo_i64:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_sub_u32 s4, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: s_cselect_b32 s5, 1, 0
; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_and_b32 s5, s5, 1
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX7-NEXT: s_cmp_lg_u32 s5, 0
+; GFX7-NEXT: s_sub_u32 s4, s0, s2
; GFX7-NEXT: s_subb_u32 s5, s1, s3
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: v_mov_b32_e32 v1, s5
@@ -505,13 +499,10 @@ define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
;
; GFX8-LABEL: s_usubo_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sub_u32 s4, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_cselect_b32 s5, 1, 0
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_and_b32 s5, s5, 1
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: s_cmp_lg_u32 s5, 0
+; GFX8-NEXT: s_sub_u32 s4, s0, s2
; GFX8-NEXT: s_subb_u32 s5, s1, s3
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -523,13 +514,10 @@ define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
;
; GFX9-LABEL: s_usubo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sub_u32 s4, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_and_b32 s5, s5, 1
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: s_cmp_lg_u32 s5, 0
+; GFX9-NEXT: s_sub_u32 s4, s0, s2
; GFX9-NEXT: s_subb_u32 s5, s1, s3
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, s5
@@ -553,8 +541,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX7-NEXT: s_cselect_b32 s2, 1, 0
; GFX7-NEXT: s_sub_u32 s1, s1, s3
; GFX7-NEXT: s_cselect_b32 s3, 1, 0
-; GFX7-NEXT: s_and_b32 s2, s2, 1
-; GFX7-NEXT: s_and_b32 s3, s3, 1
; GFX7-NEXT: s_sub_i32 s0, s0, s2
; GFX7-NEXT: s_sub_i32 s1, s1, s3
; GFX7-NEXT: ; return to shader part epilog
@@ -565,8 +551,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
; GFX8-NEXT: s_sub_u32 s1, s1, s3
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
-; GFX8-NEXT: s_and_b32 s3, s3, 1
; GFX8-NEXT: s_sub_i32 s0, s0, s2
; GFX8-NEXT: s_sub_i32 s1, s1, s3
; GFX8-NEXT: ; return to shader part epilog
@@ -577,8 +561,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-NEXT: s_sub_u32 s1, s1, s3
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
-; GFX9-NEXT: s_and_b32 s3, s3, 1
; GFX9-NEXT: s_sub_i32 s0, s0, s2
; GFX9-NEXT: s_sub_i32 s1, s1, s3
; GFX9-NEXT: ; return to shader part epilog
@@ -728,9 +710,6 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_ssubo_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_sub_u32 s4, s0, s2
-; GFX7-NEXT: s_cselect_b32 s5, 1, 0
-; GFX7-NEXT: s_and_b32 s5, s5, 1
-; GFX7-NEXT: s_cmp_lg_u32 s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_subb_u32 s5, s1, s3
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -748,9 +727,6 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX8-LABEL: s_ssubo_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s4, s0, s2
-; GFX8-NEXT: s_cselect_b32 s5, 1, 0
-; GFX8-NEXT: s_and_b32 s5, s5, 1
-; GFX8-NEXT: s_cmp_lg_u32 s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_subb_u32 s5, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -768,9 +744,6 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX9-LABEL: s_ssubo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s4, s0, s2
-; GFX9-NEXT: s_cselect_b32 s5, 1, 0
-; GFX9-NEXT: s_and_b32 s5, s5, 1
-; GFX9-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_subb_u32 s5, s1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index b07bdeeabd7ce..24284351fc911 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2591,9 +2591,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-LABEL: s_uaddsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s0, s0, s2
-; GFX6-NEXT: s_cselect_b32 s4, 1, 0
-; GFX6-NEXT: s_and_b32 s4, s4, 1
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_addc_u32 s1, s1, s3
; GFX6-NEXT: v_mov_b32_e32 v1, s3
@@ -2609,9 +2606,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-LABEL: s_uaddsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_cselect_b32 s4, 1, 0
-; GFX8-NEXT: s_and_b32 s4, s4, 1
-; GFX8-NEXT: s_cmp_lg_u32 s4, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -2627,9 +2621,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-LABEL: s_uaddsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s2
-; GFX9-NEXT: s_cselect_b32 s4, 1, 0
-; GFX9-NEXT: s_and_b32 s4, s4, 1
-; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_addc_u32 s1, s1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s3
@@ -2645,9 +2636,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX10-LABEL: s_uaddsat_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
-; GFX10-NEXT: s_and_b32 s4, s4, 1
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s1, s1, s3
; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2
@@ -2816,20 +2804,14 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-LABEL: s_uaddsat_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s0, s0, s4
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: s_and_b32 s8, s8, 1
-; GFX6-NEXT: s_cmp_lg_u32 s8, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_addc_u32 s1, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: s_add_u32 s0, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: s_cselect_b32 s1, 1, 0
-; GFX6-NEXT: s_and_b32 s1, s1, 1
-; GFX6-NEXT: s_cmp_lg_u32 s1, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: s_addc_u32 s1, s3, s7
; GFX6-NEXT: v_mov_b32_e32 v1, s7
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -2848,20 +2830,14 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-LABEL: s_uaddsat_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: s_and_b32 s8, s8, 1
-; GFX8-NEXT: s_cmp_lg_u32 s8, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: s_add_u32 s0, s2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_addc_u32 s1, s3, s7
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -2880,20 +2856,14 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-LABEL: s_uaddsat_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: s_and_b32 s8, s8, 1
-; GFX9-NEXT: s_cmp_lg_u32 s8, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: s_add_u32 s0, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_addc_u32 s1, s3, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -2912,23 +2882,17 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-LABEL: s_uaddsat_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s4
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s1, s1, s5
; GFX10-NEXT: s_add_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s3, s3, s7
-; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s4
+; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
; GFX10-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, s4
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, s5
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
@@ -2940,19 +2904,10 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-LABEL: s_uaddsat_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s0, s0, s4
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: s_and_b32 s8, s8, 1
-; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_addc_u32 s1, s1, s5
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: s_and_b32 s8, s8, 1
-; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_addc_u32 s2, s2, s6
-; GFX6-NEXT: s_cselect_b32 s8, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: s_and_b32 s8, s8, 1
+; GFX6-NEXT: s_addc_u32 s1, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: s_cmp_lg_u32 s8, 0
+; GFX6-NEXT: s_addc_u32 s2, s2, s6
; GFX6-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX6-NEXT: s_addc_u32 s3, s3, s7
@@ -2981,18 +2936,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-LABEL: s_uaddsat_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: s_and_b32 s8, s8, 1
-; GFX8-NEXT: s_cmp_lg_u32 s8, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: s_and_b32 s8, s8, 1
-; GFX8-NEXT: s_cmp_lg_u32 s8, 0
-; GFX8-NEXT: s_addc_u32 s2, s2, s6
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: s_and_b32 s8, s8, 1
; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_cmp_lg_u32 s8, 0
+; GFX8-NEXT: s_addc_u32 s2, s2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_addc_u32 s3, s3, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -3025,18 +2971,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-LABEL: s_uaddsat_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: s_and_b32 s8, s8, 1
-; GFX9-NEXT: s_cmp_lg_u32 s8, 0
; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: s_and_b32 s8, s8, 1
-; GFX9-NEXT: s_cmp_lg_u32 s8, 0
-; GFX9-NEXT: s_addc_u32 s2, s2, s6
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: s_and_b32 s8, s8, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_cmp_lg_u32 s8, 0
+; GFX9-NEXT: s_addc_u32 s2, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: s_addc_u32 s3, s3, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s6
@@ -3069,26 +3006,17 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-LABEL: s_uaddsat_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s4
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s1, s1, s5
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
+; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
; GFX10-NEXT: s_addc_u32 s3, s3, s7
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX10-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
-; GFX10-NEXT: s_and_b32 s4, 1, s4
+; GFX10-NEXT: s_cselect_b32 s8, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7]
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT: s_and_b32 s4, 1, s8
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -3450,19 +3378,10 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-LABEL: s_uaddsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s0, s0, s8
-; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_and_b32 s16, s16, 1
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_addc_u32 s1, s1, s9
-; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_and_b32 s16, s16, 1
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_addc_u32 s2, s2, s10
-; GFX6-NEXT: s_cselect_b32 s16, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: s_and_b32 s16, s16, 1
+; GFX6-NEXT: s_addc_u32 s1, s1, s9
; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_addc_u32 s2, s2, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s10
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX6-NEXT: s_addc_u32 s3, s3, s11
@@ -3472,30 +3391,21 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: s_add_u32 s0, s4, s12
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: s_cselect_b32 s1, 1, 0
-; GFX6-NEXT: s_and_b32 s1, s1, 1
-; GFX6-NEXT: s_cmp_lg_u32 s1, 0
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_addc_u32 s1, s5, s13
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: s_cselect_b32 s2, 1, 0
-; GFX6-NEXT: s_and_b32 s2, s2, 1
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s2, s6, s14
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: s_cselect_b32 s3, 1, 0
+; GFX6-NEXT: s_add_u32 s0, s4, s12
; GFX6-NEXT: v_mov_b32_e32 v2, s12
-; GFX6-NEXT: s_and_b32 s3, s3, 1
+; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_addc_u32 s1, s5, s13
; GFX6-NEXT: v_mov_b32_e32 v3, s13
; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
-; GFX6-NEXT: s_cmp_lg_u32 s3, 0
+; GFX6-NEXT: s_addc_u32 s2, s6, s14
; GFX6-NEXT: v_mov_b32_e32 v0, s14
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX6-NEXT: s_addc_u32 s3, s7, s15
@@ -3528,18 +3438,9 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-LABEL: s_uaddsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s8
-; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_and_b32 s16, s16, 1
-; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s9
-; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_and_b32 s16, s16, 1
-; GFX8-NEXT: s_cmp_lg_u32 s16, 0
-; GFX8-NEXT: s_addc_u32 s2, s2, s10
-; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_and_b32 s16, s16, 1
; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: s_cmp_lg_u32 s16, 0
+; GFX8-NEXT: s_addc_u32 s2, s2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: s_addc_u32 s3, s3, s11
; GFX8-NEXT: v_mov_b32_e32 v0, s10
@@ -3552,28 +3453,19 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: s_and_b32 s8, 1, s10
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: s_add_u32 s0, s4, s12
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_addc_u32 s1, s5, s13
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
-; GFX8-NEXT: s_cmp_lg_u32 s2, 0
-; GFX8-NEXT: s_addc_u32 s2, s6, s14
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
+; GFX8-NEXT: s_add_u32 s0, s4, s12
; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
-; GFX8-NEXT: s_and_b32 s3, s3, 1
+; GFX8-NEXT: s_addc_u32 s1, s5, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_addc_u32 s2, s6, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s13
; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
@@ -3612,18 +3504,9 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-LABEL: s_uaddsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s8
-; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_and_b32 s16, s16, 1
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_addc_u32 s1, s1, s9
-; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_and_b32 s16, s16, 1
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_addc_u32 s2, s2, s10
-; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_and_b32 s16, s16, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_addc_u32 s2, s2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: s_addc_u32 s3, s3, s11
; GFX9-NEXT: v_mov_b32_e32 v0, s10
@@ -3636,28 +3519,19 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: s_and_b32 s8, 1, s10
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: s_add_u32 s0, s4, s12
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_addc_u32 s1, s5, s13
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_addc_u32 s2, s6, s14
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-NEXT: s_add_u32 s0, s4, s12
; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
-; GFX9-NEXT: s_and_b32 s3, s3, 1
+; GFX9-NEXT: s_addc_u32 s1, s5, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s12
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_addc_u32 s2, s6, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s13
; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
@@ -3696,69 +3570,51 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-LABEL: s_uaddsat_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s8
-; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: s_and_b32 s16, s16, 1
-; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_addc_u32 s1, s1, s9
-; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
-; GFX10-NEXT: s_and_b32 s16, s16, 1
-; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_addc_u32 s2, s2, s10
-; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8
-; GFX10-NEXT: s_and_b32 s16, s16, 1
-; GFX10-NEXT: s_cmp_lg_u32 s16, 0
+; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
; GFX10-NEXT: s_addc_u32 s3, s3, s11
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX10-NEXT: v_cmp_lt_u64_e64 s10, s[2:3], s[10:11]
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8
+; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[10:11]
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8
; GFX10-NEXT: s_and_b32 s8, 1, s16
; GFX10-NEXT: s_add_u32 s4, s4, s12
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
-; GFX10-NEXT: s_and_b32 s9, s9, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s10
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_addc_u32 s5, s5, s13
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: s_and_b32 s9, s9, 1
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13]
; GFX10-NEXT: s_addc_u32 s6, s6, s14
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: s_and_b32 s8, s8, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9
-; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s7, s7, s15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9
; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15]
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_and_b32 s8, 1, s8
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, s0, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, s1, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, s2, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, s3, -1, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s1, v2
-; GFX10-NEXT: v_readfirstlane_b32 s2, v3
-; GFX10-NEXT: v_readfirstlane_b32 s3, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v0, s4, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, s5, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, s6, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, s7, -1, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s6, v2
-; GFX10-NEXT: v_readfirstlane_b32 s7, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-NEXT: v_readfirstlane_b32 s6, v6
+; GFX10-NEXT: v_readfirstlane_b32 s7, v7
; GFX10-NEXT: ; return to shader part epilog
%result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 2ba189ce7b965..3a7625ea9e362 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -194,14 +194,11 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s3
; CHECK-NEXT: s_sub_u32 s4, 0, s2
-; CHECK-NEXT: s_cselect_b32 s5, 1, 0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v2
-; CHECK-NEXT: s_and_b32 s5, s5, 1
+; CHECK-NEXT: s_subb_u32 s5, 0, s3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT: s_cmp_lg_u32 s5, 0
-; CHECK-NEXT: s_subb_u32 s5, 0, s3
; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v2
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 106ca06a30191..12423fc70269d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -117,13 +117,10 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
; GFX8-NEXT: s_sub_u32 s0, 0, s10
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
+; GFX8-NEXT: s_subb_u32 s1, 0, s11
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
-; GFX8-NEXT: s_subb_u32 s1, 0, s11
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
@@ -140,19 +137,19 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4
; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4
-; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2
; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT: v_mul_hi_u32 v8, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6
; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
@@ -269,13 +266,10 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10
; GFX9-NEXT: s_sub_u32 s0, 0, s10
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
+; GFX9-NEXT: s_subb_u32 s1, 0, s11
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
-; GFX9-NEXT: s_subb_u32 s1, 0, s11
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
@@ -296,16 +290,16 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v5
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v8
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v6
+; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
@@ -412,11 +406,8 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10
; GFX10-NEXT: s_sub_u32 s0, 0, s10
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_and_b32 s1, s1, 1
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_subb_u32 s1, 0, s11
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1026,13 +1017,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12
; GFX8-NEXT: s_sub_u32 s0, 0, s12
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
+; GFX8-NEXT: s_subb_u32 s1, 0, s13
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
-; GFX8-NEXT: s_subb_u32 s1, 0, s13
+; GFX8-NEXT: s_sub_u32 s2, 0, s14
+; GFX8-NEXT: s_subb_u32 s3, 0, s15
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
@@ -1040,7 +1030,6 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: s_sub_u32 s2, 0, s14
; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0
@@ -1050,19 +1039,19 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4
; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4
-; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2
; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT: v_mul_hi_u32 v8, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6
; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
@@ -1171,23 +1160,19 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX8-NEXT: v_trunc_f32_e32 v6, v6
; GFX8-NEXT: v_mul_f32_e32 v7, 0xcf800000, v6
; GFX8-NEXT: v_add_f32_e32 v3, v7, v3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1]
; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX8-NEXT: s_cselect_b32 s0, 1, 0
-; GFX8-NEXT: s_and_b32 s0, s0, 1
-; GFX8-NEXT: s_cmp_lg_u32 s0, 0
-; GFX8-NEXT: s_subb_u32 s3, 0, s15
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
; GFX8-NEXT: v_mul_lo_u32 v7, s3, v3
; GFX8-NEXT: v_mul_lo_u32 v8, s2, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
; GFX8-NEXT: v_mul_hi_u32 v10, s2, v3
; GFX8-NEXT: v_mul_lo_u32 v9, s2, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8
; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v10
; GFX8-NEXT: v_mul_lo_u32 v8, v6, v9
; GFX8-NEXT: v_mul_lo_u32 v10, v3, v7
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
; GFX8-NEXT: v_mul_hi_u32 v2, v3, v9
; GFX8-NEXT: v_mul_hi_u32 v9, v6, v9
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
@@ -1318,13 +1303,13 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12
; GFX9-NEXT: s_sub_u32 s0, 0, s12
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
+; GFX9-NEXT: s_subb_u32 s1, 0, s13
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
-; GFX9-NEXT: s_subb_u32 s1, 0, s13
+; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s15
+; GFX9-NEXT: s_sub_u32 s2, 0, s14
+; GFX9-NEXT: s_subb_u32 s3, 0, s15
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
@@ -1332,14 +1317,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s15
-; GFX9-NEXT: s_sub_u32 s2, 0, s14
+; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f800000, v14
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1
; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0
; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0
; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0
-; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f800000, v14
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4
; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2
@@ -1349,16 +1332,16 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v5
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v8
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v5, v5, v6
+; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
@@ -1455,20 +1438,16 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX9-NEXT: v_add_f32_e32 v5, v13, v5
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12
-; GFX9-NEXT: s_cselect_b32 s0, 1, 0
-; GFX9-NEXT: s_and_b32 s0, s0, 1
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_subb_u32 s3, 0, s15
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX9-NEXT: v_mul_lo_u32 v13, s3, v5
; GFX9-NEXT: v_mul_lo_u32 v14, s2, v12
; GFX9-NEXT: v_mul_hi_u32 v16, s2, v5
; GFX9-NEXT: v_mul_lo_u32 v17, s2, v5
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v13, v14, v16
; GFX9-NEXT: v_mul_lo_u32 v9, v12, v17
; GFX9-NEXT: v_mul_lo_u32 v13, v5, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
; GFX9-NEXT: v_mul_hi_u32 v10, v5, v17
; GFX9-NEXT: v_mul_hi_u32 v14, v12, v17
; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v9, v13
@@ -1600,19 +1579,13 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX10-NEXT: s_sub_u32 s0, 0, s12
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_and_b32 s1, s1, 1
+; GFX10-NEXT: s_subb_u32 s1, 0, s13
+; GFX10-NEXT: s_sub_u32 s2, 0, s14
+; GFX10-NEXT: s_subb_u32 s3, 0, s15
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_add_f32_e32 v1, v2, v3
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
-; GFX10-NEXT: s_subb_u32 s1, 0, s13
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT: s_sub_u32 s2, 0, s14
-; GFX10-NEXT: s_cselect_b32 s3, 1, 0
-; GFX10-NEXT: s_and_b32 s3, s3, 1
-; GFX10-NEXT: s_cmp_lg_u32 s3, 0
-; GFX10-NEXT: s_subb_u32 s3, 0, s15
; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
@@ -1690,174 +1663,174 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
; GFX10-NEXT: v_mul_lo_u32 v11, s2, v3
; GFX10-NEXT: v_mul_lo_u32 v4, s0, v0
; GFX10-NEXT: v_mul_lo_u32 v8, s2, v1
+; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: v_add3_u32 v5, v6, v5, v7
; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10
-; GFX10-NEXT: v_mul_lo_u32 v12, v2, v4
+; GFX10-NEXT: v_mul_lo_u32 v13, v2, v4
; GFX10-NEXT: v_mul_lo_u32 v10, v0, v5
-; GFX10-NEXT: v_mul_hi_u32 v13, v0, v4
+; GFX10-NEXT: v_mul_hi_u32 v14, v0, v4
; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4
; GFX10-NEXT: v_mul_lo_u32 v11, v2, v5
; GFX10-NEXT: v_mul_lo_u32 v6, v3, v8
-; GFX10-NEXT: v_mul_lo_u32 v15, v1, v9
+; GFX10-NEXT: v_mul_lo_u32 v16, v1, v9
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v8
; GFX10-NEXT: v_mul_hi_u32 v8, v3, v8
-; GFX10-NEXT: v_add_co_u32 v10, s0, v12, v10
-; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0
+; GFX10-NEXT: v_mul_lo_u32 v17, v3, v9
+; GFX10-NEXT: v_add_co_u32 v10, s0, v13, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v4, s0, v11, v4
-; GFX10-NEXT: v_mul_lo_u32 v16, v3, v9
+; GFX10-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v15
-; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v10, s0, v10, v13
+; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v16
+; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v8, s0, v17, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v10, s0, v10, v14
+; GFX10-NEXT: v_mul_hi_u32 v18, v1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v8, s0, v16, v8
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v14
-; GFX10-NEXT: v_add_nc_u32_e32 v10, v12, v10
+; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v15
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7
-; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10
-; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v14
+; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v10
+; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v18
+; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT: v_mul_hi_u32 v17, v1, v9
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v6, v15, v6
-; GFX10-NEXT: v_add3_u32 v5, v7, v10, v5
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v16, v6
+; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7
+; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v14
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
; GFX10-NEXT: v_mul_hi_u32 v9, v3, v9
-; GFX10-NEXT: v_mul_hi_u32 v10, s9, v0
-; GFX10-NEXT: v_mov_b32_e32 v12, 0
-; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v17
-; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v4, s0, v8, v6
-; GFX10-NEXT: v_mul_lo_u32 v6, s9, v0
-; GFX10-NEXT: v_mul_lo_u32 v8, s8, v2
-; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT: v_mul_lo_u32 v11, s9, v2
-; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4
-; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v8
-; GFX10-NEXT: v_add3_u32 v5, v7, v5, v9
-; GFX10-NEXT: v_mul_hi_u32 v7, s8, v2
+; GFX10-NEXT: v_add_co_u32 v6, s0, v8, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v10, v17, v10
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v0, s1, v6, v0
-; GFX10-NEXT: v_add_co_u32 v9, s0, v11, v10
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT: v_add3_u32 v5, v11, v7, v5
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_add3_u32 v4, v10, v8, v9
+; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6
+; GFX10-NEXT: v_mul_lo_u32 v5, s9, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX10-NEXT: v_mul_lo_u32 v4, s8, v2
+; GFX10-NEXT: v_mul_hi_u32 v7, s8, v0
+; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0
+; GFX10-NEXT: v_mul_lo_u32 v9, s9, v2
+; GFX10-NEXT: v_mul_hi_u32 v10, s8, v2
; GFX10-NEXT: v_mul_hi_u32 v2, s9, v2
-; GFX10-NEXT: v_add_co_u32 v7, s0, v9, v7
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v0, s0, v7, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v9
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1
-; GFX10-NEXT: v_mul_lo_u32 v8, s10, v3
-; GFX10-NEXT: v_mul_lo_u32 v9, s13, v0
-; GFX10-NEXT: v_mul_hi_u32 v10, s12, v0
-; GFX10-NEXT: v_add3_u32 v2, v4, v5, v2
-; GFX10-NEXT: v_mul_hi_u32 v7, s10, v1
+; GFX10-NEXT: v_mul_hi_u32 v8, s10, v1
+; GFX10-NEXT: v_add_co_u32 v4, s0, v5, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v0, s0, v9, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX10-NEXT: v_mul_lo_u32 v10, s10, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4
; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1
-; GFX10-NEXT: v_mul_lo_u32 v4, s11, v3
+; GFX10-NEXT: v_mul_lo_u32 v5, s11, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7
+; GFX10-NEXT: v_mul_hi_u32 v11, s10, v3
+; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v10
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v1, s0, v5, v1
+; GFX10-NEXT: v_add3_u32 v2, v7, v4, v2
+; GFX10-NEXT: v_mul_lo_u32 v5, s13, v0
+; GFX10-NEXT: v_mul_hi_u32 v7, s12, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
; GFX10-NEXT: v_mul_lo_u32 v13, s12, v0
-; GFX10-NEXT: v_mul_lo_u32 v11, s12, v2
+; GFX10-NEXT: v_mul_lo_u32 v10, s12, v2
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v11
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT: v_mul_hi_u32 v5, s10, v3
; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3
-; GFX10-NEXT: v_add_co_u32 v1, s0, v4, v1
-; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, s8, v13
-; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7
-; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v9
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s0, s9, v9, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v10
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s13, v9
-; GFX10-NEXT: v_add_nc_u32_e32 v6, v8, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, v10, s12
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v7, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v9
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo
-; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v13, v11, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v15
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0
+; GFX10-NEXT: v_add3_u32 v5, v5, v10, v7
+; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s8, v13
+; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v8
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v9, v6
+; GFX10-NEXT: v_sub_nc_u32_e32 v8, s9, v5
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s9, v5, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v7
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s13, v8, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s13, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v11, vcc_lo, v7, s12
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v5
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s13, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v9, s0
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v11
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v13
+; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0
; GFX10-NEXT: v_add_co_u32 v6, s0, v1, v6
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: v_add_co_u32 v5, s0, v0, 1
+; GFX10-NEXT: v_add_co_u32 v15, s0, v0, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v2, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v15
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v13
; GFX10-NEXT: v_add3_u32 v3, v4, v1, v3
; GFX10-NEXT: v_mul_hi_u32 v18, s14, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s0
-; GFX10-NEXT: v_mul_lo_u32 v13, s15, v6
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v14, v10, s0
+; GFX10-NEXT: v_mul_lo_u32 v14, s15, v6
; GFX10-NEXT: v_mul_lo_u32 v17, s14, v3
-; GFX10-NEXT: v_add_co_u32 v1, s0, v5, 1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX10-NEXT: v_add_co_u32 v1, s0, v15, 1
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v16, s0
-; GFX10-NEXT: v_sub_co_u32 v19, s0, v14, s12
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX10-NEXT: v_mul_lo_u32 v5, s14, v6
+; GFX10-NEXT: v_sub_co_u32 v19, s0, v11, s12
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc_lo
+; GFX10-NEXT: v_mul_lo_u32 v15, s14, v6
; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo
-; GFX10-NEXT: v_add3_u32 v13, v13, v17, v18
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX10-NEXT: v_add3_u32 v14, v14, v17, v18
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, s11, v13
-; GFX10-NEXT: v_sub_co_u32 v11, s0, s10, v5
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s11, v13, s0
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, s11, v14
+; GFX10-NEXT: v_sub_co_u32 v9, s0, s10, v15
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s1, s11, v14, s0
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v11
-; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v8
-; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v16
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0
-; GFX10-NEXT: v_sub_co_u32 v13, s0, v11, s14
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v19, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s2
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s2, 0, v2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v16
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v10
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v9
+; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v15
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v11, v19, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
+; GFX10-NEXT: v_sub_co_u32 v14, s0, v9, s14
+; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s2
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s2, 0, v2, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v11, s1
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v16
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s1
-; GFX10-NEXT: v_add_co_u32 v15, s1, v6, 1
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v14
+; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s1
+; GFX10-NEXT: v_add_co_u32 v13, s1, v6, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s1, 0, v3, s1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
-; GFX10-NEXT: v_add_co_u32 v10, s1, v15, 1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v16
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v11, s1
+; GFX10-NEXT: v_add_co_u32 v11, s1, v13, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s1, 0, v17, s1
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
-; GFX10-NEXT: v_sub_co_u32 v8, s1, v13, s14
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v10
+; GFX10-NEXT: v_sub_co_u32 v10, s1, v14, s14
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v15, v10, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v17, v18, s0
-; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v2, s0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v10, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v11, v8, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v13, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v13, v11, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v17, v18, s0
+; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v7
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v14, v10, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v16, v2, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v11, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v7, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v10, s1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v12, v[0:3], s[4:5]
; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[6:7]
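
Every udivrem hunk above removes the same scalar triple — s_cselect_b32 to materialize the borrow of s_sub_u32 into a register, s_and_b32 to mask it to one bit, s_cmp_lg_u32 to put it back into SCC for s_subb_u32 — and the remaining churn is register renumbering that falls out of the shorter sequence. A minimal sketch of an input that exercises this path (hypothetical function name, assuming the usual GlobalISel narrowing of a 64-bit scalar subtract into G_USUBO plus G_USUBE):

; Hypothetical reduced case, not a test from this patch. The G_USUBO
; borrow is now known to be zero or one, so the cselect/and/cmp
; renormalization between s_sub_u32 and s_subb_u32 folds away.
define amdgpu_ps i64 @s_sub_i64_sketch(i64 inreg %a, i64 inreg %b) {
  %d = sub i64 %a, %b
  ret i64 %d
}
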
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index ae9aeb99b258d..27de0ccd4b23a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -191,14 +191,11 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s3
; CHECK-NEXT: s_sub_u32 s4, 0, s2
-; CHECK-NEXT: s_cselect_b32 s5, 1, 0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v2
-; CHECK-NEXT: s_and_b32 s5, s5, 1
+; CHECK-NEXT: s_subb_u32 s5, 0, s3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT: s_cmp_lg_u32 s5, 0
-; CHECK-NEXT: s_subb_u32 s5, 0, s3
; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v2
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
@@ -1103,226 +1100,220 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s8
; GISEL-NEXT: s_sub_u32 s6, 0, s8
-; GISEL-NEXT: s_cselect_b32 s4, 1, 0
-; GISEL-NEXT: v_madmk_f32 v6, v4, 0x4f800000, v5
-; GISEL-NEXT: s_and_b32 s4, s4, 1
-; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v4
-; GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GISEL-NEXT: v_madmk_f32 v5, v4, 0x4f800000, v6
; GISEL-NEXT: s_subb_u32 s7, 0, 0
; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000
; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v5
-; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
+; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v5
; GISEL-NEXT: v_mov_b32_e32 v5, s4
; GISEL-NEXT: v_mov_b32_e32 v4, s5
-; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GISEL-NEXT: s_sub_u32 s9, 0, s8
-; GISEL-NEXT: s_cselect_b32 s4, 1, 0
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT: s_subb_u32 s10, 0, 0
+; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000
+; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000
+; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
+; GISEL-NEXT: v_mov_b32_e32 v10, s4
; GISEL-NEXT: v_trunc_f32_e32 v8, v8
; GISEL-NEXT: v_trunc_f32_e32 v9, v9
-; GISEL-NEXT: s_and_b32 s4, s4, 1
-; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8
+; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9
+; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v10, s6, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT: s_cmp_lg_u32 s4, 0
-; GISEL-NEXT: s_subb_u32 s10, 0, 0
-; GISEL-NEXT: v_mul_lo_u32 v11, s9, v9
-; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000
-; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000
-; GISEL-NEXT: v_mul_lo_u32 v12, s6, v6
-; GISEL-NEXT: v_mul_lo_u32 v13, s7, v6
-; GISEL-NEXT: v_mul_hi_u32 v14, s6, v6
-; GISEL-NEXT: v_mul_lo_u32 v15, s9, v7
-; GISEL-NEXT: v_mul_lo_u32 v16, s10, v7
-; GISEL-NEXT: v_mul_hi_u32 v17, s9, v7
-; GISEL-NEXT: v_mov_b32_e32 v18, s4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v13, v8, v12
-; GISEL-NEXT: v_mul_hi_u32 v19, v6, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v15
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v15
-; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17
-; GISEL-NEXT: v_mul_lo_u32 v17, v7, v11
+; GISEL-NEXT: v_mul_lo_u32 v11, s6, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_mul_lo_u32 v12, s9, v9
+; GISEL-NEXT: v_mul_lo_u32 v13, s6, v7
+; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7
+; GISEL-NEXT: v_mul_hi_u32 v15, s6, v7
+; GISEL-NEXT: v_mul_lo_u32 v16, s9, v6
+; GISEL-NEXT: v_mul_lo_u32 v17, s10, v6
+; GISEL-NEXT: v_mul_hi_u32 v18, s9, v6
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11
+; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13
+; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
+; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT: v_mul_hi_u32 v15, v6, v16
+; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18
+; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11
+; GISEL-NEXT: v_mul_lo_u32 v17, v8, v11
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19
+; GISEL-NEXT: v_mul_lo_u32 v19, v9, v12
+; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17
+; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7
+; GISEL-NEXT: v_mul_lo_u32 v13, s7, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, s6, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6
+; GISEL-NEXT: v_mul_lo_u32 v15, s10, v6
+; GISEL-NEXT: v_mul_hi_u32 v16, s9, v6
+; GISEL-NEXT: v_mul_lo_u32 v17, s6, v8
+; GISEL-NEXT: v_mul_lo_u32 v18, v8, v11
+; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17
+; GISEL-NEXT: v_mul_lo_u32 v17, s9, v9
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_mul_lo_u32 v16, v6, v15
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19
-; GISEL-NEXT: v_mul_hi_u32 v13, v6, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v16, v12
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19
+; GISEL-NEXT: v_mul_lo_u32 v19, v9, v15
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
-; GISEL-NEXT: v_mul_hi_u32 v17, v7, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v15
+; GISEL-NEXT: v_mul_hi_u32 v17, v6, v15
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v16
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, s6, v6
-; GISEL-NEXT: v_mul_lo_u32 v12, s7, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, s6, v6
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, s9, v7
-; GISEL-NEXT: v_mul_lo_u32 v14, s10, v7
-; GISEL-NEXT: v_mul_hi_u32 v15, s9, v7
-; GISEL-NEXT: v_mul_lo_u32 v16, s6, v8
-; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10
-; GISEL-NEXT: v_mul_hi_u32 v19, v6, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10
+; GISEL-NEXT: v_mov_b32_e32 v19, s11
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, s9, v9
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v11
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19
-; GISEL-NEXT: v_mul_hi_u32 v13, v6, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v17, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v9, v14
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT: v_mul_hi_u32 v16, v7, v14
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16
-; GISEL-NEXT: v_mov_b32_e32 v19, s11
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v15
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6
-; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v3, v7
-; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7
-; GISEL-NEXT: v_mul_lo_u32 v14, v0, v8
-; GISEL-NEXT: v_mul_lo_u32 v15, v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v16, v0, v8
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7
+; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6
+; GISEL-NEXT: v_mul_hi_u32 v14, v2, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT: v_mul_lo_u32 v15, v0, v8
+; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8
; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
-; GISEL-NEXT: v_mul_lo_u32 v17, v2, v9
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_lo_u32 v11, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v13, v2, v9
+; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9
+; GISEL-NEXT: v_mul_hi_u32 v14, v2, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v16
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v16
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT: v_mul_lo_u32 v12, s8, v6
-; GISEL-NEXT: v_mul_lo_u32 v14, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7
; GISEL-NEXT: v_mul_lo_u32 v15, 0, v7
; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_mul_lo_u32 v14, s8, v6
+; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; GISEL-NEXT: v_mul_lo_u32 v8, s8, v8
; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v13
-; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v7, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v14
+; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v6
; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[6:7]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v18, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 805afaad6d3e7..3a1566b63e501 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2460,11 +2460,8 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-LABEL: s_usubsat_i64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sub_u32 s4, s0, s2
-; GFX6-NEXT: s_cselect_b32 s5, 1, 0
-; GFX6-NEXT: s_and_b32 s5, s5, 1
; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: s_cmp_lg_u32 s5, 0
+; GFX6-NEXT: s_sub_u32 s4, s0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_subb_u32 s5, s1, s3
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
@@ -2478,11 +2475,8 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
;
; GFX8-LABEL: s_usubsat_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sub_u32 s4, s0, s2
-; GFX8-NEXT: s_cselect_b32 s5, 1, 0
-; GFX8-NEXT: s_and_b32 s5, s5, 1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_cmp_lg_u32 s5, 0
+; GFX8-NEXT: s_sub_u32 s4, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_subb_u32 s5, s1, s3
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
@@ -2496,11 +2490,8 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
;
; GFX9-LABEL: s_usubsat_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sub_u32 s4, s0, s2
-; GFX9-NEXT: s_cselect_b32 s5, 1, 0
-; GFX9-NEXT: s_and_b32 s5, s5, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_cmp_lg_u32 s5, 0
+; GFX9-NEXT: s_sub_u32 s4, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_subb_u32 s5, s1, s3
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
@@ -2515,10 +2506,7 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX10-LABEL: s_usubsat_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s4, s0, s2
-; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3]
-; GFX10-NEXT: s_and_b32 s5, s5, 1
-; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_subb_u32 s1, s1, s3
; GFX10-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0
@@ -2685,21 +2673,15 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v2i64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sub_u32 s8, s0, s4
-; GFX6-NEXT: s_cselect_b32 s9, 1, 0
-; GFX6-NEXT: s_and_b32 s9, s9, 1
-; GFX6-NEXT: s_cmp_lg_u32 s9, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: s_subb_u32 s9, s1, s5
+; GFX6-NEXT: s_sub_u32 s8, s0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: s_subb_u32 s9, s1, s5
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: s_sub_u32 s0, s2, s6
-; GFX6-NEXT: s_cselect_b32 s1, 1, 0
-; GFX6-NEXT: s_and_b32 s1, s1, 1
; GFX6-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: s_cmp_lg_u32 s1, 0
+; GFX6-NEXT: s_sub_u32 s0, s2, s6
; GFX6-NEXT: v_mov_b32_e32 v1, s7
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
@@ -2717,21 +2699,15 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
;
; GFX8-LABEL: s_usubsat_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sub_u32 s8, s0, s4
-; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: s_and_b32 s9, s9, 1
-; GFX8-NEXT: s_cmp_lg_u32 s9, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_subb_u32 s9, s1, s5
+; GFX8-NEXT: s_sub_u32 s8, s0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_subb_u32 s9, s1, s5
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: s_sub_u32 s0, s2, s6
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
+; GFX8-NEXT: s_sub_u32 s0, s2, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
@@ -2749,21 +2725,15 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
;
; GFX9-LABEL: s_usubsat_v2i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sub_u32 s8, s0, s4
-; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: s_and_b32 s9, s9, 1
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: s_subb_u32 s9, s1, s5
+; GFX9-NEXT: s_sub_u32 s8, s0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_subb_u32 s9, s1, s5
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: s_sub_u32 s0, s2, s6
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
+; GFX9-NEXT: s_sub_u32 s0, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
@@ -2782,23 +2752,17 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-LABEL: s_usubsat_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s8, s0, s4
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: s_and_b32 s9, s9, 1
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
-; GFX10-NEXT: s_subb_u32 s9, s1, s5
-; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[0:1], s[4:5]
+; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
+; GFX10-NEXT: s_subb_u32 s1, s1, s5
; GFX10-NEXT: s_sub_u32 s0, s2, s6
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7]
-; GFX10-NEXT: s_and_b32 s4, s4, 1
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s4
; GFX10-NEXT: s_subb_u32 s1, s3, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v2, s0, 0, s2
; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, s2
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
@@ -2809,28 +2773,19 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-LABEL: s_usubsat_i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sub_u32 s8, s0, s4
-; GFX6-NEXT: s_cselect_b32 s9, 1, 0
-; GFX6-NEXT: s_and_b32 s9, s9, 1
; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: s_cmp_lg_u32 s9, 0
; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: s_subb_u32 s9, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v0, s6
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX6-NEXT: s_cselect_b32 s10, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v1, s7
-; GFX6-NEXT: s_and_b32 s10, s10, 1
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s10, s2, s6
+; GFX6-NEXT: s_sub_u32 s8, s0, s4
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: s_cselect_b32 s11, 1, 0
-; GFX6-NEXT: s_and_b32 s11, s11, 1
+; GFX6-NEXT: s_subb_u32 s9, s1, s5
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT: s_cmp_lg_u32 s11, 0
+; GFX6-NEXT: s_subb_u32 s10, s2, s6
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: s_subb_u32 s11, s3, s7
; GFX6-NEXT: v_mov_b32_e32 v1, s8
@@ -2851,18 +2806,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-LABEL: s_usubsat_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s8, s0, s4
-; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: s_and_b32 s9, s9, 1
-; GFX8-NEXT: s_cmp_lg_u32 s9, 0
; GFX8-NEXT: s_subb_u32 s9, s1, s5
-; GFX8-NEXT: s_cselect_b32 s10, 1, 0
-; GFX8-NEXT: s_and_b32 s10, s10, 1
-; GFX8-NEXT: s_cmp_lg_u32 s10, 0
-; GFX8-NEXT: s_subb_u32 s10, s2, s6
-; GFX8-NEXT: s_cselect_b32 s11, 1, 0
-; GFX8-NEXT: s_and_b32 s11, s11, 1
; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_cmp_lg_u32 s11, 0
+; GFX8-NEXT: s_subb_u32 s10, s2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_subb_u32 s11, s3, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -2895,18 +2841,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-LABEL: s_usubsat_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s8, s0, s4
-; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: s_and_b32 s9, s9, 1
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
; GFX9-NEXT: s_subb_u32 s9, s1, s5
-; GFX9-NEXT: s_cselect_b32 s10, 1, 0
-; GFX9-NEXT: s_and_b32 s10, s10, 1
-; GFX9-NEXT: s_cmp_lg_u32 s10, 0
-; GFX9-NEXT: s_subb_u32 s10, s2, s6
-; GFX9-NEXT: s_cselect_b32 s11, 1, 0
-; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_cmp_lg_u32 s11, 0
+; GFX9-NEXT: s_subb_u32 s10, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: s_subb_u32 s11, s3, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s6
@@ -2939,33 +2876,24 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-LABEL: s_usubsat_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s8, s0, s4
-; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[4:5]
-; GFX10-NEXT: s_and_b32 s9, s9, 1
-; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_subb_u32 s9, s1, s5
-; GFX10-NEXT: s_cselect_b32 s10, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s10, s10, 1
-; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: s_subb_u32 s10, s2, s6
-; GFX10-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10-NEXT: s_and_b32 s11, s11, 1
-; GFX10-NEXT: s_cmp_lg_u32 s11, 0
-; GFX10-NEXT: s_subb_u32 s1, s3, s7
+; GFX10-NEXT: s_subb_u32 s11, s3, s7
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7]
-; GFX10-NEXT: s_cselect_b32 s0, 1, 0
-; GFX10-NEXT: s_and_b32 s0, 1, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[6:7]
+; GFX10-NEXT: s_cselect_b32 s12, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT: s_and_b32 s0, 1, s12
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, s11, 0, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
@@ -3319,61 +3247,43 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sub_u32 s16, s0, s8
-; GFX6-NEXT: s_cselect_b32 s17, 1, 0
-; GFX6-NEXT: s_and_b32 s17, s17, 1
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_subb_u32 s17, s1, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: s_cselect_b32 s18, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: s_and_b32 s18, s18, 1
; GFX6-NEXT: v_mov_b32_e32 v0, s10
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
; GFX6-NEXT: v_mov_b32_e32 v1, s11
-; GFX6-NEXT: s_subb_u32 s18, s2, s10
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_and_b32 s19, s19, 1
+; GFX6-NEXT: s_sub_u32 s16, s0, s8
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_subb_u32 s19, s3, s11
+; GFX6-NEXT: s_subb_u32 s17, s1, s9
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_sub_u32 s0, s4, s12
+; GFX6-NEXT: s_subb_u32 s18, s2, s10
; GFX6-NEXT: v_mov_b32_e32 v2, s17
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: s_cselect_b32 s1, 1, 0
+; GFX6-NEXT: s_subb_u32 s19, s3, s11
; GFX6-NEXT: v_mov_b32_e32 v1, s16
; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
-; GFX6-NEXT: s_and_b32 s1, s1, 1
; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mov_b32_e32 v1, s19
-; GFX6-NEXT: s_cmp_lg_u32 s1, 0
; GFX6-NEXT: v_mov_b32_e32 v3, s13
; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
-; GFX6-NEXT: s_subb_u32 s1, s5, s13
; GFX6-NEXT: v_mov_b32_e32 v0, s14
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX6-NEXT: s_cselect_b32 s2, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v1, s15
-; GFX6-NEXT: s_and_b32 s2, s2, 1
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_subb_u32 s2, s6, s14
+; GFX6-NEXT: s_sub_u32 s0, s4, s12
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
-; GFX6-NEXT: s_cselect_b32 s3, 1, 0
-; GFX6-NEXT: s_and_b32 s3, s3, 1
+; GFX6-NEXT: s_subb_u32 s1, s5, s13
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT: s_cmp_lg_u32 s3, 0
+; GFX6-NEXT: s_subb_u32 s2, s6, s14
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: s_subb_u32 s3, s7, s15
; GFX6-NEXT: v_mov_b32_e32 v1, s0
@@ -3398,18 +3308,9 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-LABEL: s_usubsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s16, s0, s8
-; GFX8-NEXT: s_cselect_b32 s17, 1, 0
-; GFX8-NEXT: s_and_b32 s17, s17, 1
-; GFX8-NEXT: s_cmp_lg_u32 s17, 0
; GFX8-NEXT: s_subb_u32 s17, s1, s9
-; GFX8-NEXT: s_cselect_b32 s18, 1, 0
-; GFX8-NEXT: s_and_b32 s18, s18, 1
-; GFX8-NEXT: s_cmp_lg_u32 s18, 0
-; GFX8-NEXT: s_subb_u32 s18, s2, s10
-; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: s_cmp_lg_u32 s19, 0
+; GFX8-NEXT: s_subb_u32 s18, s2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: s_subb_u32 s19, s3, s11
; GFX8-NEXT: v_mov_b32_e32 v0, s10
@@ -3422,28 +3323,19 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: s_and_b32 s0, 1, s10
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: s_sub_u32 s0, s4, s12
-; GFX8-NEXT: s_cselect_b32 s1, 1, 0
-; GFX8-NEXT: s_and_b32 s1, s1, 1
-; GFX8-NEXT: s_cmp_lg_u32 s1, 0
-; GFX8-NEXT: s_subb_u32 s1, s5, s13
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: s_and_b32 s2, s2, 1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_subb_u32 s2, s6, s14
; GFX8-NEXT: v_mov_b32_e32 v2, s17
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: s_cselect_b32 s3, 1, 0
+; GFX8-NEXT: s_sub_u32 s0, s4, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s16
; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
-; GFX8-NEXT: s_and_b32 s3, s3, 1
+; GFX8-NEXT: s_subb_u32 s1, s5, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s12
; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
; GFX8-NEXT: v_mov_b32_e32 v0, s18
; GFX8-NEXT: v_mov_b32_e32 v1, s19
-; GFX8-NEXT: s_cmp_lg_u32 s3, 0
+; GFX8-NEXT: s_subb_u32 s2, s6, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s13
; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
@@ -3482,18 +3374,9 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-LABEL: s_usubsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s16, s0, s8
-; GFX9-NEXT: s_cselect_b32 s17, 1, 0
-; GFX9-NEXT: s_and_b32 s17, s17, 1
-; GFX9-NEXT: s_cmp_lg_u32 s17, 0
; GFX9-NEXT: s_subb_u32 s17, s1, s9
-; GFX9-NEXT: s_cselect_b32 s18, 1, 0
-; GFX9-NEXT: s_and_b32 s18, s18, 1
-; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_subb_u32 s18, s2, s10
-; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_subb_u32 s18, s2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: s_subb_u32 s19, s3, s11
; GFX9-NEXT: v_mov_b32_e32 v0, s10
@@ -3506,28 +3389,19 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: s_and_b32 s0, 1, s10
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: s_sub_u32 s0, s4, s12
-; GFX9-NEXT: s_cselect_b32 s1, 1, 0
-; GFX9-NEXT: s_and_b32 s1, s1, 1
-; GFX9-NEXT: s_cmp_lg_u32 s1, 0
-; GFX9-NEXT: s_subb_u32 s1, s5, s13
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: s_and_b32 s2, s2, 1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_subb_u32 s2, s6, s14
; GFX9-NEXT: v_mov_b32_e32 v2, s17
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-NEXT: s_sub_u32 s0, s4, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s16
; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
-; GFX9-NEXT: s_and_b32 s3, s3, 1
+; GFX9-NEXT: s_subb_u32 s1, s5, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s12
; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s18
; GFX9-NEXT: v_mov_b32_e32 v1, s19
-; GFX9-NEXT: s_cmp_lg_u32 s3, 0
+; GFX9-NEXT: s_subb_u32 s2, s6, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s13
; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
@@ -3566,69 +3440,51 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-LABEL: s_usubsat_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s16, s0, s8
-; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
-; GFX10-NEXT: s_and_b32 s17, s17, 1
-; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_subb_u32 s17, s1, s9
-; GFX10-NEXT: s_cselect_b32 s18, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s18, s18, 1
-; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_subb_u32 s18, s2, s10
-; GFX10-NEXT: s_cselect_b32 s19, 1, 0
-; GFX10-NEXT: s_and_b32 s19, s19, 1
-; GFX10-NEXT: s_cmp_lg_u32 s19, 0
; GFX10-NEXT: s_subb_u32 s19, s3, s11
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[10:11]
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11]
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: s_and_b32 s0, 1, s20
-; GFX10-NEXT: s_sub_u32 s8, s4, s12
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10-NEXT: s_sub_u32 s2, s4, s12
+; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13]
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_and_b32 s1, s1, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
-; GFX10-NEXT: s_subb_u32 s3, s5, s13
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: s_and_b32 s1, s1, 1
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[4:5], s[12:13]
-; GFX10-NEXT: s_subb_u32 s10, s6, s14
-; GFX10-NEXT: s_cselect_b32 s0, 1, 0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: s_and_b32 s0, s0, 1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[6:7], s[14:15]
-; GFX10-NEXT: s_subb_u32 s9, s7, s15
+; GFX10-NEXT: s_subb_u32 s1, s5, s13
+; GFX10-NEXT: s_subb_u32 s8, s6, s14
+; GFX10-NEXT: s_subb_u32 s3, s7, s15
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15]
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
; GFX10-NEXT: s_and_b32 s0, 1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, s16, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, s17, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, s18, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, s19, 0, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s1, v2
-; GFX10-NEXT: v_readfirstlane_b32 s2, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, 0, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s3, v4
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s6, v2
-; GFX10-NEXT: v_readfirstlane_b32 s7, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-NEXT: v_readfirstlane_b32 s6, v6
+; GFX10-NEXT: v_readfirstlane_b32 s7, v7
; GFX10-NEXT: ; return to shader part epilog
%result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
ret <2 x i128> %result
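
The saturating-subtract checks shrink the same way, once per limb of the borrow chain. For orientation, the IR behind the scalar i128 variant is just an intrinsic call; this is reconstructed from the test's own define line earlier in the file and the v2i128 body shown above (the declare placement is assumed boilerplate):

declare i128 @llvm.usub.sat.i128(i128, i128)

; Mirrors s_usubsat_i128 above: the i128 subtract lowers to a four-limb
; s_sub_u32/s_subb_u32 chain, and each limb boundary previously paid the
; three-instruction bool renormalization.
define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
  %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
  ret i128 %result
}
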
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index a03df7c7cb7b7..89b3900dc2880 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -190,9 +190,6 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
; GCN-NEXT: s_not_b64 s[4:5], s[2:3]
; GCN-NEXT: s_add_u32 s2, s2, s0
-; GCN-NEXT: s_cselect_b32 s0, 1, 0
-; GCN-NEXT: s_and_b32 s0, s0, 1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s3, s3, s1
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
@@ -203,11 +200,8 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
; GFX10-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
; GFX10-NEXT: s_not_b64 s[4:5], s[2:3]
; GFX10-NEXT: s_add_u32 s2, s2, s0
-; GFX10-NEXT: s_cselect_b32 s0, 1, 0
-; GFX10-NEXT: s_and_b32 s0, s0, 1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
-; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: s_addc_u32 s3, s3, s1
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: ; return to shader part epilog
%xor = xor i64 %a, %b
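
The add side is symmetric: in the xnor hunks just above, dropping the bool renormalization lets s_addc_u32 consume SCC directly from s_add_u32, and in the GFX10 variant the s_mov_b32 copy sinks below the carry pair instead of sitting between the compare and the addc. A sketch of the underlying shape (hypothetical function name; a 64-bit scalar add narrows to G_UADDO plus G_UADDE):

; Hypothetical: the carry-out of the low-half G_UADDO is known zero or
; one, so nothing has to be selected, masked, and re-compared before the
; high-half s_addc_u32 reads SCC.
define amdgpu_ps i64 @s_add_i64_sketch(i64 inreg %a, i64 inreg %b) {
  %sum = add i64 %a, %b
  ret i64 %sum
}
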
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index ab8648f198853..44e8b5704b04e 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1616,9 +1616,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
-; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1
-; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
@@ -1635,9 +1632,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
-; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
@@ -1710,9 +1704,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
-; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1
-; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
@@ -1729,9 +1720,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
-; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
@@ -1804,9 +1792,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
-; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1
-; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
@@ -1823,9 +1808,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
-; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
@@ -1902,9 +1884,6 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
-; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1
-; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
@@ -1922,9 +1901,6 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
-; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index ea9b5628b02fd..bd0c2b30eb5de 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -283,14 +283,8 @@ define amdgpu_ps i64 @s_csh_64_0(i64 inreg %a, i64 inreg %b) {
; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2
; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
; GISEL-NEXT: s_add_u32 s2, s4, s6
-; GISEL-NEXT: s_cselect_b32 s3, 1, 0
-; GISEL-NEXT: s_and_b32 s3, s3, 1
-; GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GISEL-NEXT: s_addc_u32 s3, s5, s7
; GISEL-NEXT: s_add_u32 s0, s2, s0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_and_b32 s2, s2, 1
-; GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GISEL-NEXT: s_addc_u32 s1, s3, s1
; GISEL-NEXT: ; return to shader part epilog
%and = and i64 %b, 63
@@ -322,14 +316,8 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) {
; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2
; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
; GISEL-NEXT: s_add_u32 s2, s4, s6
-; GISEL-NEXT: s_cselect_b32 s3, 1, 0
-; GISEL-NEXT: s_and_b32 s3, s3, 1
-; GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GISEL-NEXT: s_addc_u32 s3, s5, s7
; GISEL-NEXT: s_add_u32 s0, s2, s0
-; GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GISEL-NEXT: s_and_b32 s2, s2, 1
-; GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GISEL-NEXT: s_addc_u32 s1, s3, s1
; GISEL-NEXT: ; return to shader part epilog
%and = and i64 %b, 255
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
index 7a2fc0ff1d51d..7eaeb32e461ba 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
@@ -1972,3 +1972,24 @@ TEST_F(AMDGPUGISelMITest, TestKnownBitsAssertAlign) {
CheckBits(30, Copies.size() - 2);
CheckBits(5, Copies.size() - 1);
}
+
+TEST_F(AArch64GISelMITest, TestKnownBitsUADDO) {
+ StringRef MIRString = R"(
+ %ptr:_(p0) = G_IMPLICIT_DEF
+ %ld0:_(s32) = G_LOAD %ptr(p0) :: (load (s16))
+ %ld1:_(s32) = G_LOAD %ptr(p0) :: (load (s16))
+
+ %add:_(s32), %overflow:_(s32) = G_UADDO %ld0, %ld1
+ %copy_overflow:_(s32) = COPY %overflow
+)";
+
+ setUp(MIRString);
+ if (!TM)
+ return;
+
+ Register CopyOverflow = Copies[Copies.size() - 1];
+ GISelKnownBits Info(*MF);
+ KnownBits Res = Info.getKnownBits(CopyOverflow);
+ EXPECT_EQ(0u, Res.One.getZExtValue());
+ EXPECT_EQ(31u, Res.Zero.countLeadingOnes());
+}
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
index dc915d5f5e216..bddeb1342f0d9 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
@@ -1527,3 +1527,24 @@ TEST_F(AArch64GISelMITest, TestKnownBitsVectorAssertZext) {
EXPECT_EQ(0u, Res.One.getZExtValue());
EXPECT_EQ(0xFFFFFFFFFFFFFFF8u, Res.Zero.getZExtValue());
}
+
+TEST_F(AArch64GISelMITest, TestNumSignBitsUAddoOverflow) {
+ StringRef MIRString = R"(
+ %copy_x0:_(s64) = COPY $x0
+ %copy_x1:_(s64) = COPY $x1
+ %x0_x1:_(<2 x s64>) = G_BUILD_VECTOR %copy_x0, %copy_x1
+ %uaddo:_(<2 x s64>), %overflow:_(<2 x s32>) = G_UADDO %x0_x1, %x0_x1
+ %result:_(<2 x s32>) = COPY %overflow
+)";
+
+ setUp(MIRString);
+ if (!TM)
+ return;
+
+ Register CopyOverflow = Copies[Copies.size() - 1];
+
+ GISelKnownBits Info(*MF);
+
+ // Assert sign-extension from vector boolean
+ EXPECT_EQ(32u, Info.computeNumSignBits(CopyOverflow));
+}
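Taken together, the two unit tests cover both boolean-contents modes: the scalar test checks that with ZeroOrOneBooleanContent every bit above bit 0 of the G_UADDO overflow result is known zero, while the vector test checks that with ZeroOrNegativeOneBooleanContent all 32 bits are sign bits. A minimal IR-level sketch, not part of this patch, of where such an overflow boolean comes from:

; Sketch only: the i1 carry is the overflow boolean. Once legalization
; widens it, the known-zero high bits let a zext like this one avoid
; any re-masking on ZeroOrOneBooleanContent targets.
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)

define i32 @sketch_carry_out(i32 %a, i32 %b) {
  %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %ov = extractvalue { i32, i1 } %pair, 1
  %ext = zext i1 %ov to i32
  ret i32 %ext
}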