[llvm] 9a61e73 - [GlobalISel] Combine (G_*ADDO x, 0) -> x + no carry out

Jessica Paquette via llvm-commits <llvm-commits at lists.llvm.org>
Thu Feb 3 14:31:17 PST 2022


Author: Jessica Paquette
Date: 2022-02-03T14:25:15-08:00
New Revision: 9a61e731ff4c2b7e19438efa277b8712d4431b04

URL: https://github.com/llvm/llvm-project/commit/9a61e731ff4c2b7e19438efa277b8712d4431b04
DIFF: https://github.com/llvm/llvm-project/commit/9a61e731ff4c2b7e19438efa277b8712d4431b04.diff

LOG: [GlobalISel] Combine (G_*ADDO x, 0) -> x + no carry out

Similar to the G_*MULO change.
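
For illustration, a minimal MIR sketch of the transform (mirroring the new
prelegalizer-combiner-addo-zero.mir test below):

  %zero:_(s32) = G_CONSTANT i32 0
  %add:_(s32), %o:_(s1) = G_UADDO %lhs, %zero

becomes

  %add:_(s32) = COPY %lhs(s32)
  %o:_(s1) = G_CONSTANT i1 false

The same applies to G_SADDO, and to vector splats of zero, which are matched
via m_SpecificICstOrSplat.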

The code for checking whether a constant is legal, or whether we are running
pre-legalization, is shared between these combines and is kind of hairy. So,
factor it out into a new function:
`isConstantLegalOrBeforeLegalizer`.

To make the refactoring clean, further refactor `isLegalOrBeforeLegalizer` into
a wrapper for two functions:

- `isPreLegalize`
- `isLegal`

This is a bit easier to read in general.
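
The resulting wrapper is a one-liner (excerpted from the CombinerHelper.cpp
change below):

  bool CombinerHelper::isLegalOrBeforeLegalizer(
      const LegalityQuery &Query) const {
    return isPreLegalize() || isLegal(Query);
  }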

https://godbolt.org/z/KW7oszP1o

Differential Revision: https://reviews.llvm.org/D118655

Added: 
    llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index af23060d6585c..198f120496949 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -124,10 +124,20 @@ class CombinerHelper {
 
   const TargetLowering &getTargetLowering() const;
 
+  /// \returns true if the combiner is running pre-legalization.
+  bool isPreLegalize() const;
+
+  /// \returns true if \p Query is legal on the target.
+  bool isLegal(const LegalityQuery &Query) const;
+
   /// \return true if the combine is running prior to legalization, or if \p
   /// Query is legal on the target.
   bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const;
 
+  /// \return true if the combine is running prior to legalization, or if \p Ty
+  /// is a legal integer constant type on the target.
+  bool isConstantLegalOrBeforeLegalizer(const LLT Ty) const;
+
   /// MachineRegisterInfo::replaceRegWith() and inform the observer of the changes
   void replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const;
 
@@ -649,6 +659,10 @@ class CombinerHelper {
   /// (G_*MULO x, 0) -> 0 + no carry out
   bool matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Match:
+  /// (G_*ADDO x, 0) -> x + no carry out
+  bool matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   /// Transform (fadd x, fneg(y)) -> (fsub x, y)
   ///           (fadd fneg(x), y) -> (fsub y, x)
   ///           (fsub x, fneg(y)) -> (fadd x, y)

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index c271eb0bf6ccc..ed31b6f10a31a 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -759,6 +759,12 @@ def mulo_by_0: GICombineRule<
          [{ return Helper.matchMulOBy0(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+def addo_by_0: GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_UADDO, G_SADDO):$root,
+         [{ return Helper.matchAddOBy0(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 def mulh_to_lshr : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_UMULH):$root,
@@ -868,7 +874,8 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
                                         fneg_fneg_fold, right_identity_one]>;
 
 def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p,
-                                     overlapping_and, mulo_by_2, mulo_by_0]>;
+                                     overlapping_and, mulo_by_2, mulo_by_0,
+                                     addo_by_0]>;
 
 def known_bits_simplifications : GICombineGroup<[
   redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask,

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index fba1adabd9e08..20677dcb0cd7a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -131,9 +131,27 @@ isBigEndian(const SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
   return BigEndian;
 }
 
+bool CombinerHelper::isPreLegalize() const { return !LI; }
+
+bool CombinerHelper::isLegal(const LegalityQuery &Query) const {
+  assert(LI && "Must have LegalizerInfo to query isLegal!");
+  return LI->getAction(Query).Action == LegalizeActions::Legal;
+}
+
 bool CombinerHelper::isLegalOrBeforeLegalizer(
     const LegalityQuery &Query) const {
-  return !LI || LI->getAction(Query).Action == LegalizeActions::Legal;
+  return isPreLegalize() || isLegal(Query);
+}
+
+bool CombinerHelper::isConstantLegalOrBeforeLegalizer(const LLT Ty) const {
+  if (!Ty.isVector())
+    return isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {Ty}});
+  // Vector constants are represented as a G_BUILD_VECTOR of scalar G_CONSTANTs.
+  if (isPreLegalize())
+    return true;
+  LLT EltTy = Ty.getElementType();
+  return isLegal({TargetOpcode::G_BUILD_VECTOR, {Ty, EltTy}}) &&
+         isLegal({TargetOpcode::G_CONSTANT, {EltTy}});
 }
 
 void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
@@ -4593,24 +4611,9 @@ bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
     return false;
   Register Dst = MI.getOperand(0).getReg();
   Register Carry = MI.getOperand(1).getReg();
-  LLT DstTy = MRI.getType(Dst);
-  LLT CarryTy = MRI.getType(Carry);
-  if (DstTy.isVector()) {
-    LLT DstEltTy = DstTy.getElementType();
-    if (!isLegalOrBeforeLegalizer(
-            {TargetOpcode::G_BUILD_VECTOR, {DstTy, DstEltTy}}) ||
-        !isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {DstEltTy}}))
-      return false;
-    LLT CarryEltTy = CarryTy.getElementType();
-    if (!isLegalOrBeforeLegalizer(
-            {TargetOpcode::G_BUILD_VECTOR, {CarryTy, CarryEltTy}}) ||
-        !isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {CarryEltTy}}))
-      return false;
-  } else {
-    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {DstTy}}) ||
-        !isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {CarryTy}}))
-      return false;
-  }
+  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Dst)) ||
+      !isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
+    return false;
   MatchInfo = [=](MachineIRBuilder &B) {
     B.buildConstant(Dst, 0);
     B.buildConstant(Carry, 0);
@@ -4618,6 +4621,24 @@ bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
   return true;
 }
 
+bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  // (G_*ADDO x, 0) -> x + no carry out
+  unsigned Opc = MI.getOpcode();
+  assert(Opc == TargetOpcode::G_UADDO || Opc == TargetOpcode::G_SADDO);
+  if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
+    return false;
+  Register Carry = MI.getOperand(1).getReg();
+  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
+    return false;
+  Register Dst = MI.getOperand(0).getReg();
+  Register LHS = MI.getOperand(2).getReg();
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildCopy(Dst, LHS);
+    B.buildConstant(Carry, 0);
+  };
+  return true;
+}
+
 MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
   assert(MI.getOpcode() == TargetOpcode::G_UDIV);
   auto &UDiv = cast<GenericMachineInstr>(MI);

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
new file mode 100644
index 0000000000000..0a6b85bd45ddf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
@@ -0,0 +1,136 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="addo_by_0" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+# REQUIRES: asserts
+
+# (G_*ADDO x, 0) -> x + no carry
+
+...
+---
+name:            uadd_zero
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: uadd_zero
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %add:_(s32) = COPY %lhs(s32)
+    ; CHECK-NEXT: %o:_(s1) = G_CONSTANT i1 false
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %lhs:_(s32) = COPY $w0
+    %zero:_(s32) = G_CONSTANT i32 0
+    %add:_(s32), %o:_(s1) = G_UADDO %lhs, %zero
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            sadd_zero
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: sadd_zero
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %add:_(s32) = COPY %lhs(s32)
+    ; CHECK-NEXT: %o:_(s1) = G_CONSTANT i1 false
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %lhs:_(s32) = COPY $w0
+    %zero:_(s32) = G_CONSTANT i32 0
+    %add:_(s32), %o:_(s1) = G_SADDO %lhs, %zero
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            wrong_cst
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: wrong_cst
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(s32) = COPY $w0
+    ; CHECK-NEXT: %not_zero:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: %add:_(s32), %o:_(s1) = G_UADDO %lhs, %not_zero
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %lhs:_(s32) = COPY $w0
+    %not_zero:_(s32) = G_CONSTANT i32 3
+    %add:_(s32), %o:_(s1) = G_UADDO %lhs, %not_zero
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            uadd_vec_zero
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $x0
+    ; CHECK-LABEL: name: uadd_vec_zero
+    ; CHECK: liveins: $q0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %add:_(<2 x s64>) = COPY %lhs(<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+    ; CHECK-NEXT: %o:_(<2 x s1>) = G_BUILD_VECTOR [[C]](s1), [[C]](s1)
+    ; CHECK-NEXT: %o_elt_0:_(s1) = G_EXTRACT_VECTOR_ELT %o(<2 x s1>), %zero(s64)
+    ; CHECK-NEXT: %o_wide:_(s64) = G_ZEXT %o_elt_0(s1)
+    ; CHECK-NEXT: $q0 = COPY %add(<2 x s64>)
+    ; CHECK-NEXT: $x0 = COPY %o_wide(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %lhs:_(<2 x s64>) = COPY $q0
+    %zero:_(s64) = G_CONSTANT i64 0
+    %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero, %zero
+    %add:_(<2 x s64>), %o:_(<2 x s1>) = G_UADDO %lhs, %zero_vec
+    %o_elt_0:_(s1) = G_EXTRACT_VECTOR_ELT %o:_(<2 x s1>), %zero:_(s64)
+    %o_wide:_(s64) = G_ZEXT %o_elt_0
+    $q0 = COPY %add(<2 x s64>)
+    $x0 = COPY %o_wide
+    RET_ReallyLR implicit $q0
+...
+---
+name:            sadd_vec_zero
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $x0
+    ; CHECK-LABEL: name: sadd_vec_zero
+    ; CHECK: liveins: $q0, $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lhs:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %add:_(<2 x s64>) = COPY %lhs(<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+    ; CHECK-NEXT: %o:_(<2 x s1>) = G_BUILD_VECTOR [[C]](s1), [[C]](s1)
+    ; CHECK-NEXT: %o_elt_0:_(s1) = G_EXTRACT_VECTOR_ELT %o(<2 x s1>), %zero(s64)
+    ; CHECK-NEXT: %o_wide:_(s64) = G_ZEXT %o_elt_0(s1)
+    ; CHECK-NEXT: $q0 = COPY %add(<2 x s64>)
+    ; CHECK-NEXT: $x0 = COPY %o_wide(s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %lhs:_(<2 x s64>) = COPY $q0
+    %zero:_(s64) = G_CONSTANT i64 0
+    %zero_vec:_(<2 x s64>) = G_BUILD_VECTOR %zero, %zero
+    %add:_(<2 x s64>), %o:_(<2 x s1>) = G_SADDO %lhs, %zero_vec
+    %o_elt_0:_(s1) = G_EXTRACT_VECTOR_ELT %o:_(<2 x s1>), %zero:_(s64)
+    %o_wide:_(s64) = G_ZEXT %o_elt_0
+    $q0 = COPY %add(<2 x s64>)
+    $x0 = COPY %o_wide
+    RET_ReallyLR implicit $q0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index c118e757ef539..77c182f1a8ac7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4155,10 +4155,10 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT:    v_add_i32_e64 v2, s[6:7], 0, v0
+; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4171,10 +4171,10 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT:    v_add_u32_e64 v2, s[6:7], 0, v0
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4187,10 +4187,10 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[6:7], 0, v0
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4200,13 +4200,13 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v0, s5, v6, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
-; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
@@ -4226,17 +4226,15 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX6-NEXT:    s_mov_b32 s3, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT:    s_add_u32 s0, s2, 0
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    s_addc_u32 s1, s2, 0x80000000
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
@@ -4254,17 +4252,15 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX8-NEXT:    s_mov_b32 s3, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT:    s_add_u32 s0, s2, 0
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s2, 0x80000000
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
@@ -4282,17 +4278,15 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT:    s_add_u32 s0, s2, 0
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s2, 0x80000000
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
@@ -4302,23 +4296,21 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s4, s0, s2
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], 0
 ; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_addc_u32 s5, s1, s3
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
-; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], s[0:1]
+; GFX10-NEXT:    s_ashr_i32 s0, s5, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    s_xor_b32 s3, s1, s0
-; GFX10-NEXT:    s_add_u32 s0, s2, 0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s3
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_addc_u32 s1, s2, 0x80000000
+; GFX10-NEXT:    s_xor_b32 s2, s2, s1
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s2
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0x80000000
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s2
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s3
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
@@ -4335,10 +4327,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT:    v_add_i32_e64 v4, s[2:3], 0, v0
+; GFX6-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX6-NEXT:    v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -4351,10 +4343,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT:    v_add_u32_e64 v4, s[2:3], 0, v0
+; GFX8-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX8-NEXT:    v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -4367,10 +4359,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[2:3], 0, v0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -4378,13 +4370,13 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, s0, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[0:1]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], v[2:3]
-; GFX10-NEXT:    v_add_co_u32 v0, s1, v4, 0
+; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
+; GFX10-NEXT:    s_mov_b32 s1, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1
-; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
@@ -4402,10 +4394,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], 0, v0
+; GFX6-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX6-NEXT:    v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -4418,10 +4410,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], 0, v0
+; GFX8-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX8-NEXT:    v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -4434,10 +4426,10 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], 0, v0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -4446,12 +4438,12 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[0:1], 0
+; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v4, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
@@ -4469,21 +4461,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX6-NEXT:    v_bfrev_b32_e32 v10, 1
-; GFX6-NEXT:    v_add_i32_e64 v1, s[6:7], 0, v0
-; GFX6-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[6:7], 0
+; GFX6-NEXT:    v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v2, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v3, v7, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX6-NEXT:    v_add_i32_e64 v3, s[6:7], 0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
+; GFX6-NEXT:    v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i64:
@@ -4495,21 +4486,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX8-NEXT:    v_bfrev_b32_e32 v10, 1
-; GFX8-NEXT:    v_add_u32_e64 v1, s[6:7], 0, v0
-; GFX8-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v2, v6
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v3, v7, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX8-NEXT:    v_add_u32_e64 v3, s[6:7], 0, v2
-; GFX8-NEXT:    v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
+; GFX8-NEXT:    v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i64:
@@ -4521,21 +4511,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX9-NEXT:    v_bfrev_b32_e32 v10, 1
-; GFX9-NEXT:    v_add_co_u32_e64 v1, s[6:7], 0, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[8:9], v0, v10, s[6:7]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v7, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e64 v3, s[6:7], 0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[6:7], v2, v10, s[6:7]
+; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v2, v10, s[6:7]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v2i64:
@@ -4546,21 +4535,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
-; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[4:5]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, v12, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT:    v_add_co_u32 v2, s7, v0, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
+; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s7, 0, v[6:7]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0x80000000, v0, s5
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, v[10:11], v[2:3]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s5, 0x80000000, v4, s5
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
-; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 vcc_lo, s7, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
@@ -4580,20 +4568,18 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX6-NEXT:    s_mov_b32 s10, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT:    s_add_u32 s0, s4, 0
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_addc_u32 s1, s4, s5
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    s_add_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX6-NEXT:    s_and_b32 s1, s1, 1
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
@@ -4603,17 +4589,15 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    s_add_u32 s0, s4, 0
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v2
@@ -4633,20 +4617,18 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX8-NEXT:    s_mov_b32 s10, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT:    s_add_u32 s0, s4, 0
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s4, s5
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    s_add_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
@@ -4656,17 +4638,15 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX8-NEXT:    s_mov_b32 s6, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    s_add_u32 s0, s4, 0
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s4, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
@@ -4686,20 +4666,18 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX9-NEXT:    s_mov_b32 s10, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT:    s_add_u32 s0, s4, 0
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    s_brev_b32 s5, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_add_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
@@ -4709,17 +4687,15 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_add_u32 s0, s4, 0
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_addc_u32 s3, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
@@ -4733,39 +4709,34 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[4:5], 0
 ; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    s_mov_b32 s11, 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX10-NEXT:    s_brev_b32 s10, 1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX10-NEXT:    s_addc_u32 s9, s1, s5
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT:    s_ashr_i32 s1, s9, 31
+; GFX10-NEXT:    s_brev_b32 s10, 1
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
+; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    s_xor_b32 s8, s4, s0
-; GFX10-NEXT:    s_add_u32 s0, s1, 0
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_xor_b32 s8, s4, s1
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s8
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_addc_u32 s1, s1, s10
+; GFX10-NEXT:    s_addc_u32 s1, s0, s10
 ; GFX10-NEXT:    s_add_u32 s4, s2, s6
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_addc_u32 s5, s3, s7
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[6:7], 0
-; GFX10-NEXT:    s_ashr_i32 s1, s5, 31
+; GFX10-NEXT:    s_ashr_i32 s0, s5, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    s_xor_b32 s2, s3, s2
-; GFX10-NEXT:    s_add_u32 s0, s1, 0
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    s_addc_u32 s1, s0, s10
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_addc_u32 s1, s1, s10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, s2
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
@@ -4803,24 +4774,22 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
-; GFX6-NEXT:    s_ashr_i32 s3, s9, 31
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s3, 0
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
+; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
+; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX6-NEXT:    s_addc_u32 s1, s3, 0
+; GFX6-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_addc_u32 s2, s3, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX6-NEXT:    s_addc_u32 s2, s0, 0
+; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_and_b32 s6, s6, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX6-NEXT:    s_and_b32 s3, s3, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX6-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s4
@@ -4873,24 +4842,22 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT:    s_ashr_i32 s3, s9, 31
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX8-NEXT:    s_add_u32 s0, s3, 0
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_ashr_i32 s0, s9, 31
+; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX8-NEXT:    s_addc_u32 s2, s3, 0
+; GFX8-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_and_b32 s6, s6, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX8-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s4
@@ -4943,24 +4910,22 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT:    s_and_b32 s0, 1, s2
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT:    s_ashr_i32 s3, s9, 31
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s3, 0
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_ashr_i32 s0, s9, 31
+; GFX9-NEXT:    s_mov_b32 s1, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_addc_u32 s1, s3, 0
+; GFX9-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_addc_u32 s2, s3, 0
+; GFX9-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    s_and_b32 s6, s6, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX9-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s4
@@ -4984,54 +4949,52 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s4, s0, s4
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s10, s[6:7], 0
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s5, s1, s5
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s8, s2, s6
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s9, s9, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_addc_u32 s9, s3, s7
 ; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[2:3]
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[8:9], s[2:3]
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s9
-; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    s_and_b32 s0, 1, s10
 ; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[6:7], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_ashr_i32 s3, s9, 31
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s0, 1, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s0
-; GFX10-NEXT:    s_add_u32 s0, s3, 0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT:    s_and_b32 s1, 1, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10-NEXT:    s_addc_u32 s2, s3, 0
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_addc_u32 s2, s0, 0
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s2, vcc_lo
@@ -5056,7 +5019,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v2, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v3, vcc
 ; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT:    v_bfrev_b32_e32 v8, 1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
@@ -5067,17 +5029,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX6-NEXT:    s_mov_b64 vcc, 0
 ; GFX6-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0, v3
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v3, v7, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: saddsat_i128_sv:
@@ -5090,7 +5053,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v2, vcc
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v3, vcc
 ; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT:    v_bfrev_b32_e32 v8, 1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
@@ -5101,17 +5063,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX8-NEXT:    s_mov_b64 vcc, 0
 ; GFX8-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0, v3
-; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v3, vcc
+; GFX8-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v3, v7, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: saddsat_i128_sv:
@@ -5124,7 +5087,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
 ; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT:    v_bfrev_b32_e32 v8, 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
@@ -5135,17 +5097,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX9-NEXT:    s_mov_b64 vcc, 0
 ; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 0, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: saddsat_i128_sv:
@@ -5165,17 +5128,17 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
+; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v3, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0x80000000, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v7, s0
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5196,25 +5159,25 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT:    v_bfrev_b32_e32 v8, 1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX6-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
+; GFX6-NEXT:    s_mov_b64 vcc, 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: saddsat_i128_vs:
@@ -5241,17 +5204,17 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_bfrev_b32_e32 v8, 1
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX8-NEXT:    s_mov_b64 vcc, 0
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: saddsat_i128_vs:
@@ -5278,17 +5241,17 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_bfrev_b32_e32 v8, 1
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
+; GFX9-NEXT:    s_mov_b64 vcc, 0
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: saddsat_i128_vs:
@@ -5310,18 +5273,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
+; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s0
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5349,42 +5312,41 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v17
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v1, v18, vcc
+; GFX6-NEXT:    s_mov_b64 vcc, 0
+; GFX6-NEXT:    v_addc_u32_e64 v2, s[4:5], 0, v1, vcc
+; GFX6-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5]
+; GFX6-NEXT:    v_addc_u32_e64 v10, s[4:5], v1, v18, s[4:5]
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v16, v10, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
-; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v4, v12
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v5, v13, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v10, vcc, v6, v14, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v7, v15, vcc
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v8, v1, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v9, v2, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v16, v3, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v17, v10, s[4:5]
+; GFX6-NEXT:    v_add_i32_e64 v8, s[4:5], v4, v12
+; GFX6-NEXT:    v_addc_u32_e64 v9, s[4:5], v5, v13, s[4:5]
+; GFX6-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v14, s[4:5]
+; GFX6-NEXT:    v_addc_u32_e64 v11, s[4:5], v7, v15, s[4:5]
+; GFX6-NEXT:    v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GFX6-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0, v5
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, v5, v18, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v12, vcc, v5, v18, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i128:
@@ -5407,42 +5369,41 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v17
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v1, v18, vcc
+; GFX8-NEXT:    s_mov_b64 vcc, 0
+; GFX8-NEXT:    v_addc_u32_e64 v2, s[4:5], 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5]
+; GFX8-NEXT:    v_addc_u32_e64 v10, s[4:5], v1, v18, s[4:5]
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v12
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v5, v13, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, v6, v14, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v7, v15, vcc
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v16, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v17, v10, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v8, s[4:5], v4, v12
+; GFX8-NEXT:    v_addc_u32_e64 v9, s[4:5], v5, v13, s[4:5]
+; GFX8-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v14, s[4:5]
+; GFX8-NEXT:    v_addc_u32_e64 v11, s[4:5], v7, v15, s[4:5]
+; GFX8-NEXT:    v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GFX8-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0, v5
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, v5, v18, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, v5, v18, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i128:
@@ -5465,42 +5426,41 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v17
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v1, v18, vcc
+; GFX9-NEXT:    s_mov_b64 vcc, 0
+; GFX9-NEXT:    v_addc_co_u32_e64 v2, s[4:5], 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, v1, s[4:5]
+; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[4:5], v1, v18, s[4:5]
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v4, v12
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v5, v13, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v6, v14, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v15, vcc
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v8, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v16, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v17, v10, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v8, s[4:5], v4, v12
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], v5, v13, s[4:5]
+; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[4:5], v6, v14, s[4:5]
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], v7, v15, s[4:5]
+; GFX9-NEXT:    v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 0, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v5, v18, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v5, v18, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v2i128:
@@ -5525,40 +5485,39 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[10:11], v[4:5]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s5, v[10:11], v[4:5]
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v17
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, v[12:13], v[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[12:13], v[6:7]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s6, v[12:13], v[6:7]
+; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s5
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[12:13], v[6:7]
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[14:15]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s4
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v4, s5
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[14:15]
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v18, 0, s5
-; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v2, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v16, v5, s4
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v4
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v7, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v17, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v8, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s5
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v13
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s4, 0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s5, 0, v[14:15]
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s6
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s6, 0, v[14:15]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s4, 0, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, 0, s6
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s4, 0x80000000, v1, s4
+; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v1, s5
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v2, s5
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v16, v4, s5
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v17, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v10, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v8, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v13, v9, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
@@ -5592,23 +5551,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
-; GFX6-NEXT:    s_ashr_i32 s3, s17, 31
+; GFX6-NEXT:    s_brev_b32 s10, 1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s3, 0
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
+; GFX6-NEXT:    s_ashr_i32 s0, s17, 31
+; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX6-NEXT:    s_addc_u32 s1, s3, 0
+; GFX6-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_addc_u32 s2, s3, 0
-; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX6-NEXT:    s_and_b32 s11, s11, 1
-; GFX6-NEXT:    s_brev_b32 s10, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_addc_u32 s2, s0, 0
+; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX6-NEXT:    s_and_b32 s3, s3, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    s_addc_u32 s3, s3, s10
+; GFX6-NEXT:    s_addc_u32 s3, s0, s10
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    s_add_u32 s0, s4, s12
@@ -5649,24 +5606,22 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[14:15], 0
-; GFX6-NEXT:    s_ashr_i32 s7, s3, 31
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
-; GFX6-NEXT:    s_add_u32 s4, s7, 0
-; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX6-NEXT:    s_and_b32 s5, s5, 1
+; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX6-NEXT:    s_mov_b32 s5, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX6-NEXT:    s_addc_u32 s5, s7, 0
+; GFX6-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    s_and_b32 s6, s6, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX6-NEXT:    s_addc_u32 s6, s7, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_addc_u32 s6, s4, 0
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_and_b32 s8, s8, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX6-NEXT:    s_and_b32 s7, s7, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s7, s7, s10
+; GFX6-NEXT:    s_addc_u32 s7, s4, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s0
@@ -5723,23 +5678,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT:    s_ashr_i32 s3, s17, 31
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX8-NEXT:    s_add_u32 s0, s3, 0
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_ashr_i32 s0, s17, 31
+; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX8-NEXT:    s_addc_u32 s2, s3, 0
-; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX8-NEXT:    s_and_b32 s11, s11, 1
+; GFX8-NEXT:    s_addc_u32 s2, s0, 0
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    s_brev_b32 s10, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    s_addc_u32 s3, s3, s10
+; GFX8-NEXT:    s_addc_u32 s3, s0, s10
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    s_add_u32 s0, s4, s12
@@ -5786,24 +5739,22 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX8-NEXT:    s_and_b32 s4, 1, s6
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
-; GFX8-NEXT:    s_ashr_i32 s7, s3, 31
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
-; GFX8-NEXT:    s_add_u32 s4, s7, 0
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX8-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX8-NEXT:    s_addc_u32 s5, s7, 0
+; GFX8-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    s_and_b32 s6, s6, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX8-NEXT:    s_addc_u32 s6, s7, 0
+; GFX8-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_and_b32 s8, s8, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX8-NEXT:    s_and_b32 s7, s7, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s7, s7, s10
+; GFX8-NEXT:    s_addc_u32 s7, s4, s10
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
@@ -5860,23 +5811,21 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT:    s_and_b32 s0, 1, s2
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT:    s_ashr_i32 s3, s17, 31
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s3, 0
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_ashr_i32 s0, s17, 31
+; GFX9-NEXT:    s_mov_b32 s1, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_addc_u32 s1, s3, 0
+; GFX9-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_addc_u32 s2, s3, 0
-; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX9-NEXT:    s_and_b32 s11, s11, 1
+; GFX9-NEXT:    s_addc_u32 s2, s0, 0
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    s_brev_b32 s10, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    s_addc_u32 s3, s3, s10
+; GFX9-NEXT:    s_addc_u32 s3, s0, s10
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_add_u32 s0, s4, s12
@@ -5923,24 +5872,22 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9-NEXT:    s_and_b32 s4, 1, s6
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
-; GFX9-NEXT:    s_ashr_i32 s7, s3, 31
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s7, 0
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX9-NEXT:    s_addc_u32 s5, s7, 0
+; GFX9-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    s_and_b32 s6, s6, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX9-NEXT:    s_addc_u32 s6, s7, 0
+; GFX9-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    s_and_b32 s8, s8, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX9-NEXT:    s_and_b32 s7, s7, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s7, s7, s10
+; GFX9-NEXT:    s_addc_u32 s7, s4, s10
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
@@ -5974,9 +5921,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
 ; GFX10-NEXT:    s_and_b32 s16, s16, 1
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX10-NEXT:    s_addc_u32 s16, s2, s10
 ; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -5985,80 +5930,78 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_addc_u32 s17, s3, s11
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
 ; GFX10-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s17
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s18
 ; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_ashr_i32 s3, s17, 31
-; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    s_brev_b32 s10, 1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT:    s_mov_b32 s1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s0
-; GFX10-NEXT:    s_add_u32 s0, s3, 0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
+; GFX10-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s8
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s9
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX10-NEXT:    s_and_b32 s2, s2, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10-NEXT:    s_addc_u32 s2, s3, 0
-; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_addc_u32 s2, s0, 0
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_and_b32 s11, s11, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10-NEXT:    s_addc_u32 s3, s3, s10
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_addc_u32 s3, s0, s10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
 ; GFX10-NEXT:    s_add_u32 s0, s4, s12
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s16
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s16
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[14:15], 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[14:15], 0
 ; GFX10-NEXT:    s_addc_u32 s8, s6, s14
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
 ; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    v_mov_b32_e32 v6, s1
-; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s8
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
 ; GFX10-NEXT:    s_addc_u32 s9, s7, s15
 ; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[6:7]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[6:7]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
 ; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s9
 ; GFX10-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    s_ashr_i32 s5, s9, 31
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT:    s_and_b32 s3, 1, s2
-; GFX10-NEXT:    s_add_u32 s2, s5, 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s3, 0, s3
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s3
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_addc_u32 s3, s5, 0
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, 1, s3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s2, 0, s3
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, 0, s2
+; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, s1
+; GFX10-NEXT:    s_addc_u32 s3, s2, 0
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 1
@@ -6066,12 +6009,12 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    s_addc_u32 s4, s5, 0
-; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    s_and_b32 s6, s6, 1
+; GFX10-NEXT:    s_addc_u32 s4, s2, 0
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-NEXT:    s_addc_u32 s1, s5, s10
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_addc_u32 s1, s2, s10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s4, vcc_lo

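(Editorial aside on the hunks above: they all follow the same shape. The leading add of a literal 0 into the sign word, e.g. `v_add_co_u32_e32 v2, vcc, 0, v1`, disappears, and the rest of the carry chain is instead seeded with a zero carry-in via `s_mov_b64 vcc, 0` / `s_mov_b32 vcc_lo, 0`, because the first G_UADDO in the saturating-add expansion now folds away. Below is a minimal illustrative sketch of what a match function for this fold can look like; it is not the committed body, and it assumes the usual CombinerHelper context — the `MRI` member, `MIPatternMatch`, and the helper declared in this patch's header changes. A full implementation would also want to match splat-vector zeros.

bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
  // (G_*ADDO x, 0) -> x + no carry out
  assert(MI.getOpcode() == TargetOpcode::G_UADDO ||
         MI.getOpcode() == TargetOpcode::G_SADDO);
  // Operand layout: (sum, carry) = G_*ADDO lhs, rhs.
  Register Dst = MI.getOperand(0).getReg();
  Register Carry = MI.getOperand(1).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  // Only fold when the addend is a scalar constant zero. (A real
  // implementation would also handle splat vectors.)
  if (!mi_match(RHS, MRI, MIPatternMatch::m_SpecificICst(0)))
    return false;
  // The rewrite must materialize a 0 for the carry-out, so make sure a
  // constant of that type is legal, or that we are still pre-legalization.
  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
    return false;
  MatchInfo = [=](MachineIRBuilder &B) {
    B.buildCopy(Dst, LHS);
    B.buildConstant(Carry, 0);
  };
  return true;
}

With the sum replaced by a copy of `x` and the carry pinned to 0, targets like AMDGPU no longer see the `add x, 0` at all, which is exactly why the ssubsat.ll diffs below lose their `v_add_*, 0` instructions too.)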
diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 1f8a322dbdd12..dbd4eab951828 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4141,10 +4141,10 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT:    v_add_i32_e64 v2, s[6:7], 0, v0
+; GFX6-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX6-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4157,10 +4157,10 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT:    v_add_u32_e64 v2, s[6:7], 0, v0
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX8-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4173,10 +4173,10 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[6:7], 0, v0
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4186,13 +4186,13 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v0, s5, v6, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
-; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
@@ -4212,17 +4212,15 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX6-NEXT:    s_mov_b32 s3, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT:    s_add_u32 s0, s2, 0
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    s_addc_u32 s1, s2, 0x80000000
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
@@ -4240,17 +4238,15 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX8-NEXT:    s_mov_b32 s3, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT:    s_add_u32 s0, s2, 0
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s2, 0x80000000
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
@@ -4268,17 +4264,15 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT:    s_add_u32 s0, s2, 0
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s2, 0x80000000
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
@@ -4288,23 +4282,21 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s4, s0, s2
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[2:3], 0
 ; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_subb_u32 s5, s1, s3
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
-; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], s[0:1]
+; GFX10-NEXT:    s_ashr_i32 s0, s5, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    s_xor_b32 s3, s1, s0
-; GFX10-NEXT:    s_add_u32 s0, s2, 0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s3
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_addc_u32 s1, s2, 0x80000000
+; GFX10-NEXT:    s_xor_b32 s2, s2, s1
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s2
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0x80000000
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s2
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s3
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
@@ -4321,10 +4313,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT:    v_add_i32_e64 v4, s[2:3], 0, v0
+; GFX6-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX6-NEXT:    v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -4337,10 +4329,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT:    v_add_u32_e64 v4, s[2:3], 0, v0
+; GFX8-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX8-NEXT:    v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -4353,10 +4345,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[2:3], 0, v0
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -4364,13 +4356,13 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, s0, v0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[0:1]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], v[2:3]
-; GFX10-NEXT:    v_add_co_u32 v0, s1, v4, 0
+; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
+; GFX10-NEXT:    s_mov_b32 s1, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1
-; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
@@ -4388,10 +4380,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], 0, v0
+; GFX6-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX6-NEXT:    v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -4404,10 +4396,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], 0, v0
+; GFX8-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX8-NEXT:    v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -4420,10 +4412,10 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], 0, v0
+; GFX9-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -4432,12 +4424,12 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[0:1], 0
+; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v4, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
@@ -4455,21 +4447,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX6-NEXT:    v_bfrev_b32_e32 v10, 1
-; GFX6-NEXT:    v_add_i32_e64 v1, s[6:7], 0, v0
-; GFX6-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX6-NEXT:    s_mov_b64 s[6:7], 0
+; GFX6-NEXT:    v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v2, v6
 ; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v3, v7, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX6-NEXT:    v_add_i32_e64 v3, s[6:7], 0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
+; GFX6-NEXT:    v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7]
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i64:
@@ -4481,21 +4472,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX8-NEXT:    v_bfrev_b32_e32 v10, 1
-; GFX8-NEXT:    v_add_u32_e64 v1, s[6:7], 0, v0
-; GFX8-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_addc_u32_e64 v1, s[8:9], v0, v10, s[6:7]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v2, v6
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v3, v7, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX8-NEXT:    v_add_u32_e64 v3, s[6:7], 0, v2
-; GFX8-NEXT:    v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
+; GFX8-NEXT:    v_addc_u32_e64 v3, s[6:7], v2, v10, s[6:7]
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i64:
@@ -4507,21 +4497,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GFX9-NEXT:    v_bfrev_b32_e32 v10, 1
-; GFX9-NEXT:    v_add_co_u32_e64 v1, s[6:7], 0, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7]
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[8:9], v0, v10, s[6:7]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v2, v6
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e64 v3, s[6:7], 0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[6:7], v2, v10, s[6:7]
+; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v2, v10, s[6:7]
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v2i64:
@@ -4532,21 +4521,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v2, v6
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
-; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[4:5]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, v12, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT:    v_add_co_u32 v2, s7, v0, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
+; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s7, 0, v[6:7]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s6, 0x80000000, v0, s5
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, v[10:11], v[2:3]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s5, 0x80000000, v4, s5
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
-; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 vcc_lo, s7, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
@@ -4566,20 +4554,18 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX6-NEXT:    s_mov_b32 s10, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT:    s_add_u32 s0, s4, 0
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_addc_u32 s1, s4, s5
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX6-NEXT:    s_and_b32 s1, s1, 1
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
@@ -4589,17 +4575,15 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    s_add_u32 s0, s4, 0
-; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX6-NEXT:    s_and_b32 s2, s2, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v2
@@ -4619,20 +4603,18 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX8-NEXT:    s_mov_b32 s10, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT:    s_add_u32 s0, s4, 0
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s4, s5
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
@@ -4642,17 +4624,15 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX8-NEXT:    s_mov_b32 s6, 0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    s_add_u32 s0, s4, 0
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, s2, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s4, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
@@ -4672,20 +4652,18 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX9-NEXT:    s_mov_b32 s10, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT:    s_add_u32 s0, s4, 0
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    s_brev_b32 s5, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s4, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
@@ -4695,17 +4673,15 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    s_add_u32 s0, s4, 0
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_and_b32 s2, s2, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_addc_u32 s3, s4, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
@@ -4719,39 +4695,34 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, s[4:5], 0
 ; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    s_mov_b32 s11, 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
-; GFX10-NEXT:    s_brev_b32 s10, 1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX10-NEXT:    s_subb_u32 s9, s1, s5
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT:    s_ashr_i32 s1, s9, 31
+; GFX10-NEXT:    s_brev_b32 s10, 1
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
+; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    s_xor_b32 s8, s4, s0
-; GFX10-NEXT:    s_add_u32 s0, s1, 0
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_xor_b32 s8, s4, s1
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s8
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_addc_u32 s1, s1, s10
+; GFX10-NEXT:    s_addc_u32 s1, s0, s10
 ; GFX10-NEXT:    s_sub_u32 s4, s2, s6
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_subb_u32 s5, s3, s7
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s3, s[6:7], 0
-; GFX10-NEXT:    s_ashr_i32 s1, s5, 31
+; GFX10-NEXT:    s_ashr_i32 s0, s5, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    s_xor_b32 s2, s3, s2
-; GFX10-NEXT:    s_add_u32 s0, s1, 0
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    s_addc_u32 s1, s0, s10
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_addc_u32 s1, s1, s10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, s2
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
@@ -4789,26 +4760,24 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
-; GFX6-NEXT:    s_ashr_i32 s3, s11, 31
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s3, 0
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
+; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX6-NEXT:    s_addc_u32 s1, s3, 0
+; GFX6-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[6:7], 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_addc_u32 s2, s3, 0
+; GFX6-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_and_b32 s4, s4, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX6-NEXT:    s_and_b32 s3, s3, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX6-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
@@ -4863,24 +4832,22 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX8-NEXT:    s_and_b32 s0, 1, s2
-; GFX8-NEXT:    s_ashr_i32 s3, s11, 31
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_add_u32 s0, s3, 0
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX8-NEXT:    s_addc_u32 s2, s3, 0
+; GFX8-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_and_b32 s4, s4, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX8-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s8
@@ -4935,24 +4902,22 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT:    s_and_b32 s0, 1, s2
-; GFX9-NEXT:    s_ashr_i32 s3, s11, 31
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_add_u32 s0, s3, 0
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX9-NEXT:    s_mov_b32 s1, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_addc_u32 s1, s3, 0
+; GFX9-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_addc_u32 s2, s3, 0
+; GFX9-NEXT:    s_addc_u32 s2, s0, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    s_and_b32 s4, s4, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX9-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s8
@@ -4987,45 +4952,43 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s11, s11, 1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    s_subb_u32 s11, s3, s7
-; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
+; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[6:7], 0
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s11
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s1, s[4:5], 0
 ; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s0, s[4:5], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_ashr_i32 s3, s11, 31
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[6:7], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s0, 1, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_add_u32 s0, s3, 0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX10-NEXT:    s_and_b32 s1, 1, s1
+; GFX10-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX10-NEXT:    s_mov_b32 s1, 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s9
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX10-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s8
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s8
+; GFX10-NEXT:    s_addc_u32 s2, s0, 0
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_addc_u32 s2, s3, 0
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s10
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    s_addc_u32 s3, s3, 0x80000000
+; GFX10-NEXT:    s_addc_u32 s3, s0, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s2, vcc_lo
@@ -5060,20 +5023,20 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX6-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_mov_b64 vcc, 0
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX6-NEXT:    v_bfrev_b32_e32 v8, 1
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: ssubsat_i128_sv:
@@ -5096,20 +5059,20 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[2:3]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_mov_b64 vcc, 0
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX8-NEXT:    v_bfrev_b32_e32 v8, 1
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: ssubsat_i128_sv:
@@ -5132,20 +5095,20 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_mov_b64 vcc, 0
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX9-NEXT:    v_bfrev_b32_e32 v8, 1
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_i128_sv:
@@ -5167,17 +5130,17 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
+; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s0
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5198,9 +5161,9 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT:    v_bfrev_b32_e32 v8, 1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX6-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
@@ -5209,16 +5172,16 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX6-NEXT:    s_mov_b64 vcc, 0
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: ssubsat_i128_vs:
@@ -5241,23 +5204,23 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX8-NEXT:    v_bfrev_b32_e32 v8, 1
+; GFX8-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX8-NEXT:    s_and_b32 s0, 1, s4
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
+; GFX8-NEXT:    s_mov_b64 vcc, 0
+; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: ssubsat_i128_vs:
@@ -5280,23 +5243,23 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX9-NEXT:    v_bfrev_b32_e32 v8, 1
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT:    s_and_b32 s0, 1, s4
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
+; GFX9-NEXT:    s_mov_b64 vcc, 0
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: ssubsat_i128_vs:
@@ -5320,18 +5283,18 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
+; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0x80000000, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s0
 ; GFX10-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5361,44 +5324,43 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v19
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v1, v20, vcc
+; GFX6-NEXT:    s_mov_b64 vcc, 0
+; GFX6-NEXT:    v_addc_u32_e64 v2, s[4:5], 0, v1, vcc
+; GFX6-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5]
+; GFX6-NEXT:    v_addc_u32_e64 v8, s[4:5], v1, v20, s[4:5]
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v18, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v19, v9, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v4, v12
-; GFX6-NEXT:    v_subb_u32_e32 v9, vcc, v5, v13, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v10, vcc, v6, v14, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v11, vcc, v7, v15, vcc
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[12:13]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[14:15]
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v16, v1, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v17, v2, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v18, v3, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v19, v8, s[4:5]
+; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v4, v12
+; GFX6-NEXT:    v_subb_u32_e64 v9, s[4:5], v5, v13, s[4:5]
+; GFX6-NEXT:    v_subb_u32_e64 v10, s[4:5], v6, v14, s[4:5]
+; GFX6-NEXT:    v_subb_u32_e64 v11, s[4:5], v7, v15, s[4:5]
+; GFX6-NEXT:    v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX6-NEXT:    v_cmp_lt_u64_e64 s[4:5], 0, v[12:13]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[14:15]
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[4:5]
 ; GFX6-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0, v5
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, v5, v20, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v12, vcc, v5, v20, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i128:
@@ -5423,44 +5385,43 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v19
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v1, v20, vcc
+; GFX8-NEXT:    s_mov_b64 vcc, 0
+; GFX8-NEXT:    v_addc_u32_e64 v2, s[4:5], 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, v1, s[4:5]
+; GFX8-NEXT:    v_addc_u32_e64 v8, s[4:5], v1, v20, s[4:5]
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v9, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v4, v12
-; GFX8-NEXT:    v_subb_u32_e32 v9, vcc, v5, v13, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v10, vcc, v6, v14, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v11, vcc, v7, v15, vcc
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[12:13]
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[14:15]
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, v1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v17, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v18, v3, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v19, v8, s[4:5]
+; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v4, v12
+; GFX8-NEXT:    v_subb_u32_e64 v9, s[4:5], v5, v13, s[4:5]
+; GFX8-NEXT:    v_subb_u32_e64 v10, s[4:5], v6, v14, s[4:5]
+; GFX8-NEXT:    v_subb_u32_e64 v11, s[4:5], v7, v15, s[4:5]
+; GFX8-NEXT:    v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX8-NEXT:    v_cmp_lt_u64_e64 s[4:5], 0, v[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX8-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[4:5]
 ; GFX8-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0, v5
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v5, vcc
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, v5, v20, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, v5, v20, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i128:
@@ -5485,44 +5446,43 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v19
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v20, vcc
+; GFX9-NEXT:    s_mov_b64 vcc, 0
+; GFX9-NEXT:    v_addc_co_u32_e64 v2, s[4:5], 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, v1, s[4:5]
+; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[4:5], v1, v20, s[4:5]
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v9, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v4, v12
-; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v5, v13, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, v6, v14, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, v7, v15, vcc
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v16, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v17, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v18, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v19, v8, s[4:5]
+; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v4, v12
+; GFX9-NEXT:    v_subb_co_u32_e64 v9, s[4:5], v5, v13, s[4:5]
+; GFX9-NEXT:    v_subb_co_u32_e64 v10, s[4:5], v6, v14, s[4:5]
+; GFX9-NEXT:    v_subb_co_u32_e64 v11, s[4:5], v7, v15, s[4:5]
+; GFX9-NEXT:    v_cmp_lt_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[10:11], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[10:11], v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_lt_u64_e64 s[4:5], 0, v[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[4:5]
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 0, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v5, v20, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v5, v20, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v2i128:
@@ -5549,42 +5509,41 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[8:9], v[4:5]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s5, v[8:9], v[4:5]
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v19
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, v[10:11], v[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[10:11], v[6:7]
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s6, v[10:11], v[6:7]
+; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s5
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[6:7]
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, 0, v[12:13]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s4
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[14:15]
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s4
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v4, s5
-; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[14:15]
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v13, v12, s5
-; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v2, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v5, s4
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v4
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v7, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s5
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s4, 0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s5, 0, v[12:13]
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, 0, v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s5
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s6
+; GFX10-NEXT:    v_cmp_eq_u64_e64 s6, 0, v[14:15]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s4, 0, v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v12, v5, s6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v6, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s4, 0x80000000, v1, s4
+; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0x80000000, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v2, s5
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v4, s5
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v13, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
@@ -5618,25 +5577,23 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
-; GFX6-NEXT:    s_ashr_i32 s3, s19, 31
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s3, 0
-; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX6-NEXT:    s_and_b32 s1, s1, 1
+; GFX6-NEXT:    s_ashr_i32 s0, s19, 31
+; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX6-NEXT:    s_addc_u32 s1, s3, 0
+; GFX6-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_addc_u32 s2, s3, 0
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_addc_u32 s2, s0, 0
+; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[10:11], 0
-; GFX6-NEXT:    s_and_b32 s9, s9, 1
+; GFX6-NEXT:    s_and_b32 s3, s3, 1
 ; GFX6-NEXT:    s_brev_b32 s8, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT:    s_addc_u32 s3, s3, s8
+; GFX6-NEXT:    s_addc_u32 s3, s0, s8
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    s_sub_u32 s0, s4, s12
@@ -5677,26 +5634,24 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
-; GFX6-NEXT:    s_ashr_i32 s7, s3, 31
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX6-NEXT:    s_add_u32 s4, s7, 0
-; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX6-NEXT:    s_and_b32 s5, s5, 1
+; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX6-NEXT:    s_mov_b32 s5, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX6-NEXT:    s_addc_u32 s5, s7, 0
+; GFX6-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    s_and_b32 s6, s6, 1
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[14:15], 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX6-NEXT:    s_addc_u32 s6, s7, 0
+; GFX6-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT:    s_and_b32 s9, s9, 1
-; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX6-NEXT:    s_and_b32 s7, s7, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_addc_u32 s7, s7, s8
+; GFX6-NEXT:    s_addc_u32 s7, s4, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s0
@@ -5755,23 +5710,21 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX8-NEXT:    s_and_b32 s0, 1, s2
-; GFX8-NEXT:    s_ashr_i32 s3, s19, 31
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_add_u32 s0, s3, 0
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_ashr_i32 s0, s19, 31
+; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX8-NEXT:    s_addc_u32 s2, s3, 0
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_and_b32 s9, s9, 1
+; GFX8-NEXT:    s_addc_u32 s2, s0, 0
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    s_brev_b32 s8, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    s_addc_u32 s3, s3, s8
+; GFX8-NEXT:    s_addc_u32 s3, s0, s8
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    s_sub_u32 s0, s4, s12
@@ -5820,24 +5773,22 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX8-NEXT:    s_and_b32 s4, 1, s6
-; GFX8-NEXT:    s_ashr_i32 s7, s3, 31
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT:    s_add_u32 s4, s7, 0
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX8-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX8-NEXT:    s_addc_u32 s5, s7, 0
+; GFX8-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    s_and_b32 s6, s6, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX8-NEXT:    s_addc_u32 s6, s7, 0
+; GFX8-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_and_b32 s9, s9, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX8-NEXT:    s_and_b32 s7, s7, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_addc_u32 s7, s7, s8
+; GFX8-NEXT:    s_addc_u32 s7, s4, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
@@ -5896,23 +5847,21 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX9-NEXT:    s_and_b32 s0, 1, s2
-; GFX9-NEXT:    s_ashr_i32 s3, s19, 31
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_add_u32 s0, s3, 0
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_ashr_i32 s0, s19, 31
+; GFX9-NEXT:    s_mov_b32 s1, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_addc_u32 s1, s3, 0
+; GFX9-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_addc_u32 s2, s3, 0
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_and_b32 s9, s9, 1
+; GFX9-NEXT:    s_addc_u32 s2, s0, 0
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    s_brev_b32 s8, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    s_addc_u32 s3, s3, s8
+; GFX9-NEXT:    s_addc_u32 s3, s0, s8
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_sub_u32 s0, s4, s12
@@ -5961,24 +5910,22 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX9-NEXT:    s_and_b32 s4, 1, s6
-; GFX9-NEXT:    s_ashr_i32 s7, s3, 31
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT:    s_add_u32 s4, s7, 0
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_mov_b32 s5, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
-; GFX9-NEXT:    s_addc_u32 s5, s7, 0
+; GFX9-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    s_and_b32 s6, s6, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX9-NEXT:    s_addc_u32 s6, s7, 0
+; GFX9-NEXT:    s_addc_u32 s6, s4, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    s_and_b32 s9, s9, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX9-NEXT:    s_and_b32 s7, s7, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_addc_u32 s7, s7, s8
+; GFX9-NEXT:    s_addc_u32 s7, s4, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
@@ -6012,7 +5959,6 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
 ; GFX10-NEXT:    s_and_b32 s18, s18, 1
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s1, s[8:9], 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_subb_u32 s18, s2, s10
 ; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
@@ -6022,98 +5968,94 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
 ; GFX10-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s19
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s20
 ; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[10:11], 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_ashr_i32 s3, s19, 31
-; GFX10-NEXT:    s_and_b32 s0, 1, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[10:11], 0
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_add_u32 s0, s3, 0
-; GFX10-NEXT:    s_brev_b32 s10, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v2, s17
-; GFX10-NEXT:    s_addc_u32 s1, s3, 0
+; GFX10-NEXT:    s_mov_b32 s10, 0
+; GFX10-NEXT:    s_and_b32 s1, 1, s1
+; GFX10-NEXT:    s_ashr_i32 s0, s19, 31
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_brev_b32 s11, 1
+; GFX10-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-NEXT:    v_mov_b32_e32 v1, s16
+; GFX10-NEXT:    v_mov_b32_e32 v2, s17
 ; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s19
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    s_addc_u32 s2, s0, 0
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s16
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    s_addc_u32 s2, s3, 0
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_addc_u32 s3, s0, s11
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX10-NEXT:    s_addc_u32 s3, s3, s10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
 ; GFX10-NEXT:    s_sub_u32 s0, s4, s12
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
-; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s18
-; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s18
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
 ; GFX10-NEXT:    s_and_b32 s8, s8, 1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s4, s[12:13], 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s3, s[12:13], 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
 ; GFX10-NEXT:    s_subb_u32 s8, s6, s14
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s3
 ; GFX10-NEXT:    s_and_b32 s9, s9, 1
-; GFX10-NEXT:    v_mov_b32_e32 v7, s8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
 ; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, s[14:15], 0
 ; GFX10-NEXT:    s_subb_u32 s9, s7, s15
 ; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[6:7]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[6:7]
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
 ; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v8, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s4
 ; GFX10-NEXT:    s_and_b32 s2, 1, s2
 ; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
-; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX10-NEXT:    s_ashr_i32 s5, s9, 31
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s3, s[14:15], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s3
-; GFX10-NEXT:    s_and_b32 s3, 1, s2
-; GFX10-NEXT:    s_add_u32 s2, s5, 0
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
+; GFX10-NEXT:    s_and_b32 s3, 1, s3
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT:    s_addc_u32 s3, s2, 0
 ; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, s9
 ; GFX10-NEXT:    s_and_b32 s4, s4, 1
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc_lo
 ; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s1
-; GFX10-NEXT:    s_addc_u32 s3, s5, 0
-; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_addc_u32 s4, s2, 0
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
-; GFX10-NEXT:    s_and_b32 s4, s4, 1
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
-; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, s8
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    s_addc_u32 s4, s5, 0
-; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    s_and_b32 s6, s6, 1
+; GFX10-NEXT:    s_addc_u32 s1, s2, s11
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX10-NEXT:    s_addc_u32 s1, s5, s10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s4, vcc_lo
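
For reference, the net effect of the new combine on generic MIR looks roughly
like the following. This is a minimal sketch with made-up register names and an
s32 type, not the literal contents of the MIR test added alongside this change:

    ; Before: add-with-overflow of x and a known zero.
    %zero:_(s32) = G_CONSTANT i32 0
    %sum:_(s32), %carry:_(s1) = G_SADDO %x, %zero

    ; After: adding 0 can never overflow, so the sum is just x and the
    ; carry-out collapses to a constant false.
    %sum:_(s32) = COPY %x(s32)
    %carry:_(s1) = G_CONSTANT i1 false

The AMDGPU churn above follows from this: carry chains that previously started
with an explicit "s_add_u32 sN, sM, 0" plus carry bookkeeping now begin from a
constant-zero carry instead, e.g. "s_mov_b32 sN, 0" followed by
"s_cmp_lg_u32 sN, 0" on the scalar path, or "s_mov_b64 vcc, 0" feeding the
v_addc chain on the vector path.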


        

