[llvm] 5f77461 - [GlobalIsel] Combine ADDO (#82927)

via llvm-commits <llvm-commits at lists.llvm.org>
Thu Mar 14 04:45:23 PDT 2024


Author: Thorsten Schütt
Date: 2024-03-14T12:45:19+01:00
New Revision: 5f774619eac5db73398225a4c924a9c1d437fb40

URL: https://github.com/llvm/llvm-project/commit/5f774619eac5db73398225a4c924a9c1d437fb40
DIFF: https://github.com/llvm/llvm-project/commit/5f774619eac5db73398225a4c924a9c1d437fb40.diff

LOG: [GlobalIsel] Combine ADDO (#82927)

These operations perform the requested arithmetic and produce a carry output
in addition to the normal result.

Clang exposes them as builtins (__builtin_add_overflow_p), and the middle end
has intrinsics for them (sadd_with_overflow).
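
As a rough, hypothetical illustration (not part of this patch), the helper
below uses the related Clang/GCC builtin __builtin_add_overflow (the variant
that also returns the wrapped result). Clang lowers it to the
llvm.[s|u]add.with.overflow intrinsics, which GlobalISel imports as the
G_SADDO / G_UADDO instructions this combine operates on:

#include <climits>
#include <cstdio>

// Hypothetical helper, not taken from the patch: checked signed addition.
// __builtin_add_overflow returns true if the addition overflowed and stores
// the (wrapped) result through the pointer in either case.
bool add_checked(int a, int b, int *out) {
  return __builtin_add_overflow(a, b, out);
}

int main() {
  int r;
  if (add_checked(INT_MAX, 1, &r))
    std::printf("signed overflow, wrapped result = %d\n", r);
  return 0;
}

The AArch64 test updates below show GlobalISel lowering this pattern to an
ADDS followed by a cset of the overflow flag.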

AArch64: ADDS (add and set flags)

On Neoverse V2, these instructions run at half the throughput of basic
arithmetic and can only issue on a limited set of pipelines.

Added: 
    llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
    llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
    llvm/test/CodeGen/AArch64/overflow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 23728636498ba0..9e8fc5d635c50a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -696,10 +696,6 @@ class CombinerHelper {
   /// (G_*MULO x, 0) -> 0 + no carry out
   bool matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Match:
-  /// (G_*ADDO x, 0) -> x + no carry out
-  bool matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
-
   /// Match:
   /// (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
   /// (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -810,12 +806,15 @@ class CombinerHelper {
   /// Combine selects.
   bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Combine ands,
+  /// Combine ands.
   bool matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Combine ors,
+  /// Combine ors.
   bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Combine addos.
+  bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -919,6 +918,7 @@ class CombinerHelper {
   bool isZeroOrZeroSplat(Register Src, bool AllowUndefs);
   bool isConstantSplatVector(Register Src, int64_t SplatValue,
                              bool AllowUndefs);
+  bool isConstantOrConstantVectorI(Register Src) const;
 
   std::optional<APInt> getConstantOrConstantSplatVector(Register Src);
 

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index f5a6528d10a973..6b03703192df91 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -359,6 +359,8 @@ class GBinOpCarryOut : public GenericMachineInstr {
   Register getCarryOutReg() const { return getReg(1); }
   MachineOperand &getLHS() { return getOperand(2); }
   MachineOperand &getRHS() { return getOperand(3); }
+  Register getLHSReg() const { return getOperand(2).getReg(); }
+  Register getRHSReg() const { return getOperand(3).getReg(); }
 
   static bool classof(const MachineInstr *MI) {
     switch (MI->getOpcode()) {
@@ -429,6 +431,23 @@ class GAddSubCarryOut : public GBinOpCarryOut {
   }
 };
 
+/// Represents overflowing add operations.
+/// G_UADDO, G_SADDO
+class GAddCarryOut : public GBinOpCarryOut {
+public:
+  bool isSigned() const { return getOpcode() == TargetOpcode::G_SADDO; }
+
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_UADDO:
+    case TargetOpcode::G_SADDO:
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+
 /// Represents overflowing add/sub operations that also consume a carry-in.
 /// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
 class GAddSubCarryInOut : public GAddSubCarryOut {

diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9f18a5b8560098..6980cbd04aeb1c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1090,12 +1090,6 @@ def mulo_by_0: GICombineRule<
          [{ return Helper.matchMulOBy0(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
-def addo_by_0: GICombineRule<
-  (defs root:$root, build_fn_matchinfo:$matchinfo),
-  (match (wip_match_opcode G_UADDO, G_SADDO):$root,
-         [{ return Helper.matchAddOBy0(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
-
 // Transform (uadde x, y, 0) -> (uaddo x, y)
 //           (sadde x, y, 0) -> (saddo x, y)
 //           (usube x, y, 0) -> (usubo x, y)
@@ -1291,6 +1285,12 @@ def match_ors : GICombineRule<
         [{ return Helper.matchOr(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+def match_addos : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SADDO, G_UADDO):$root,
+        [{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 // Combines concat operations
 def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
 def combine_concat_vector : GICombineRule<
@@ -1326,7 +1326,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
 
 def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,
                                      overlapping_and, mulo_by_2, mulo_by_0,
-                                     addo_by_0, adde_to_addo,
+                                     adde_to_addo,
                                      combine_minmax_nan]>;
 
 def known_bits_simplifications : GICombineGroup<[
@@ -1374,7 +1374,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax, redundant_binop_in_equality,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
-    combine_concat_vector, double_icmp_zero_and_or_combine]>;
+    combine_concat_vector, double_icmp_zero_and_or_combine, match_addos]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 2e706b46801938..bee49dbd0f8380 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4936,24 +4936,6 @@ bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
   return true;
 }
 
-bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
-  // (G_*ADDO x, 0) -> x + no carry out
-  assert(MI.getOpcode() == TargetOpcode::G_UADDO ||
-         MI.getOpcode() == TargetOpcode::G_SADDO);
-  if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
-    return false;
-  Register Carry = MI.getOperand(1).getReg();
-  if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
-    return false;
-  Register Dst = MI.getOperand(0).getReg();
-  Register LHS = MI.getOperand(2).getReg();
-  MatchInfo = [=](MachineIRBuilder &B) {
-    B.buildCopy(Dst, LHS);
-    B.buildConstant(Carry, 0);
-  };
-  return true;
-}
-
 bool CombinerHelper::matchAddEToAddO(MachineInstr &MI, BuildFnTy &MatchInfo) {
   // (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
   // (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -6354,6 +6336,26 @@ CombinerHelper::getConstantOrConstantSplatVector(Register Src) {
   return Value;
 }
 
+// FIXME G_SPLAT_VECTOR
+bool CombinerHelper::isConstantOrConstantVectorI(Register Src) const {
+  auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+  if (IConstant)
+    return true;
+
+  GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+  if (!BuildVector)
+    return false;
+
+  unsigned NumSources = BuildVector->getNumSources();
+  for (unsigned I = 0; I < NumSources; ++I) {
+    std::optional<ValueAndVReg> IConstant =
+        getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+    if (!IConstant)
+      return false;
+  }
+  return true;
+}
+
 // TODO: use knownbits to determine zeros
 bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
                                               BuildFnTy &MatchInfo) {
@@ -6928,3 +6930,178 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) {
 
   return false;
 }
+
+bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  GAddCarryOut *Add = cast<GAddCarryOut>(&MI);
+
+  // Addo has no flags
+  Register Dst = Add->getReg(0);
+  Register Carry = Add->getReg(1);
+  Register LHS = Add->getLHSReg();
+  Register RHS = Add->getRHSReg();
+  bool IsSigned = Add->isSigned();
+  LLT DstTy = MRI.getType(Dst);
+  LLT CarryTy = MRI.getType(Carry);
+
+  // We want to fold the [u|s]addo.
+  if (!MRI.hasOneNonDBGUse(Dst))
+    return false;
+
+  // Fold addo, if the carry is dead -> add, undef.
+  if (MRI.use_nodbg_empty(Carry) &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}})) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildAdd(Dst, LHS, RHS);
+      B.buildUndef(Carry);
+    };
+    return true;
+  }
+
+  // We want to fold the [u|s]addo.
+  if (!MRI.hasOneNonDBGUse(Carry))
+    return false;
+
+  // Canonicalize constant to RHS.
+  if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) {
+    if (IsSigned) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSAddo(Dst, Carry, RHS, LHS);
+      };
+      return true;
+    }
+    // !IsSigned
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildUAddo(Dst, Carry, RHS, LHS);
+    };
+    return true;
+  }
+
+  std::optional<APInt> MaybeLHS = getConstantOrConstantSplatVector(LHS);
+  std::optional<APInt> MaybeRHS = getConstantOrConstantSplatVector(RHS);
+
+  // Fold addo(c1, c2) -> c3, carry.
+  if (MaybeLHS && MaybeRHS && isConstantLegalOrBeforeLegalizer(DstTy) &&
+      isConstantLegalOrBeforeLegalizer(CarryTy)) {
+    bool Overflow;
+    APInt Result = IsSigned ? MaybeLHS->sadd_ov(*MaybeRHS, Overflow)
+                            : MaybeLHS->uadd_ov(*MaybeRHS, Overflow);
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildConstant(Dst, Result);
+      B.buildConstant(Carry, Overflow);
+    };
+    return true;
+  }
+
+  // Fold (addo x, 0) -> x, no carry out
+  if (MaybeRHS && *MaybeRHS == 0 && isConstantLegalOrBeforeLegalizer(CarryTy)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildCopy(Dst, LHS);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+
+  // Given 2 constant operands whose sum does not overflow:
+  // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
+  // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
+  GAdd *AddLHS = getOpcodeDef<GAdd>(LHS, MRI);
+  if (MaybeRHS && AddLHS && MRI.hasOneNonDBGUse(Add->getReg(0)) &&
+      ((IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoSWrap)) ||
+       (!IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoUWrap)))) {
+    std::optional<APInt> MaybeAddRHS =
+        getConstantOrConstantSplatVector(AddLHS->getRHSReg());
+    if (MaybeAddRHS) {
+      bool Overflow;
+      APInt NewC = IsSigned ? MaybeAddRHS->sadd_ov(*MaybeRHS, Overflow)
+                            : MaybeAddRHS->uadd_ov(*MaybeRHS, Overflow);
+      if (!Overflow && isConstantLegalOrBeforeLegalizer(DstTy)) {
+        if (IsSigned) {
+          MatchInfo = [=](MachineIRBuilder &B) {
+            auto ConstRHS = B.buildConstant(DstTy, NewC);
+            B.buildSAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+          };
+          return true;
+        }
+        // !IsSigned
+        MatchInfo = [=](MachineIRBuilder &B) {
+          auto ConstRHS = B.buildConstant(DstTy, NewC);
+          B.buildUAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+        };
+        return true;
+      }
+    }
+  };
+
+  // We try to combine addo to non-overflowing add.
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) ||
+      !isConstantLegalOrBeforeLegalizer(CarryTy))
+    return false;
+
+  // We try to combine uaddo to non-overflowing add.
+  if (!IsSigned) {
+    ConstantRange CRLHS =
+        ConstantRange::fromKnownBits(KB->getKnownBits(LHS), /*IsSigned=*/false);
+    ConstantRange CRRHS =
+        ConstantRange::fromKnownBits(KB->getKnownBits(RHS), /*IsSigned=*/false);
+
+    switch (CRLHS.unsignedAddMayOverflow(CRRHS)) {
+    case ConstantRange::OverflowResult::MayOverflow:
+      return false;
+    case ConstantRange::OverflowResult::NeverOverflows: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+        B.buildConstant(Carry, 0);
+      };
+      return true;
+    }
+    case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+    case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildAdd(Dst, LHS, RHS);
+        B.buildConstant(Carry, 1);
+      };
+      return true;
+    }
+    }
+    return false;
+  }
+
+  // We try to combine saddo to non-overflowing add.
+
+  // If LHS and RHS each have at least two sign bits, then there is no signed
+  // overflow.
+  if (KB->computeNumSignBits(RHS) > 1 && KB->computeNumSignBits(LHS) > 1) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+
+  ConstantRange CRLHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(LHS), /*IsSigned=*/true);
+  ConstantRange CRRHS =
+      ConstantRange::fromKnownBits(KB->getKnownBits(RHS), /*IsSigned=*/true);
+
+  switch (CRLHS.signedAddMayOverflow(CRRHS)) {
+  case ConstantRange::OverflowResult::MayOverflow:
+    return false;
+  case ConstantRange::OverflowResult::NeverOverflows: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+      B.buildConstant(Carry, 0);
+    };
+    return true;
+  }
+  case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+  case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.buildAdd(Dst, LHS, RHS);
+      B.buildConstant(Carry, 1);
+    };
+    return true;
+  }
+  }
+
+  return false;
+}

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
new file mode 100644
index 00000000000000..6fced31a622d9d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+
+---
+name:            add_unused
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: add_unused
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: %add:_(s32) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %add:_(s32), %o:_(s1) = G_SADDO %0, %1
+    $w0 = COPY %add(s32)
+    RET_ReallyLR implicit $w0
+...
+---
+name:            add_canon
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: add_canon
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 10
+    ; CHECK-NEXT: %add:_(s32), %o:_(s1) = G_SADDO [[COPY]], %const
+    ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %const:_(s32) = G_CONSTANT i32 10
+    %add:_(s32), %o:_(s1) = G_SADDO %const, %1
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            add_const_fold
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: add_const_fold
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %add:_(s32) = G_CONSTANT i32 21
+    ; CHECK-NEXT: %o_wide:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %const:_(s32) = G_CONSTANT i32 10
+    %const1:_(s32) = G_CONSTANT i32 11
+    %add:_(s32), %o:_(s1) = G_UADDO %const, %const1
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...
+---
+name:            add_add_zero
+body:             |
+  bb.0:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: add_add_zero
+    ; CHECK: liveins: $w0, $w1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w2
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+    ; CHECK-NEXT: $w1 = COPY [[C]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = COPY $w1
+    %2:_(s32) = COPY $w2
+    %const:_(s32) = G_CONSTANT i32 10
+    %addl:_(s32) = nsw G_ADD  %2, %const
+    %const1:_(s32) = G_CONSTANT i32 -10
+    %add:_(s32), %o:_(s1) = G_SADDO %addl, %const1
+    %o_wide:_(s32) = G_ZEXT %o(s1)
+    $w0 = COPY %add(s32)
+    $w1 = COPY %o_wide
+    RET_ReallyLR implicit $w0
+...

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
index 94f56e5650b22f..9483cbf06f4057 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="addo_by_0" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="match_addos" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
 # REQUIRES: asserts
 
 # (G_*ADDO x, 0) -> x + no carry

diff --git a/llvm/test/CodeGen/AArch64/overflow.ll b/llvm/test/CodeGen/AArch64/overflow.ll
index 444aaeb0f3fe75..1fd60c03097906 100644
--- a/llvm/test/CodeGen/AArch64/overflow.ll
+++ b/llvm/test/CodeGen/AArch64/overflow.ll
@@ -19,20 +19,12 @@ entry:
 }
 
 define zeroext i1 @saddo1.i32.fold(i32 %v1, i32 %v2, ptr %res) {
-; SDAG-LABEL: saddo1.i32.fold:
-; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    mov w8, #20 // =0x14
-; SDAG-NEXT:    mov w0, wzr
-; SDAG-NEXT:    str w8, [x2]
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: saddo1.i32.fold:
-; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    mov w8, #9 // =0x9
-; GISEL-NEXT:    adds w8, w8, #11
-; GISEL-NEXT:    cset w0, vs
-; GISEL-NEXT:    str w8, [x2]
-; GISEL-NEXT:    ret
+; CHECK-LABEL: saddo1.i32.fold:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #20 // =0x14
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    str w8, [x2]
+; CHECK-NEXT:    ret
 entry:
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 9, i32 11)
   %val = extractvalue {i32, i1} %t, 0
@@ -123,18 +115,11 @@ entry:
 }
 
 define zeroext i1 @saddo.canon.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; SDAG-LABEL: saddo.canon.i32:
-; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    mov w0, wzr
-; SDAG-NEXT:    str w4, [x5]
-; SDAG-NEXT:    ret
-;
-; GISEL-LABEL: saddo.canon.i32:
-; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    adds w8, wzr, w4
-; GISEL-NEXT:    cset w0, vs
-; GISEL-NEXT:    str w8, [x5]
-; GISEL-NEXT:    ret
+; CHECK-LABEL: saddo.canon.i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    str w4, [x5]
+; CHECK-NEXT:    ret
 entry:
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 0, i32 %v5)
   %val = extractvalue {i32, i1} %t, 0
@@ -143,13 +128,19 @@ entry:
   ret i1 %obit
 }
 define zeroext i1 @saddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; CHECK-LABEL: saddo.add.i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add w8, w4, #100
-; CHECK-NEXT:    subs w8, w8, #100
-; CHECK-NEXT:    cset w0, vs
-; CHECK-NEXT:    str w8, [x5]
-; CHECK-NEXT:    ret
+; SDAG-LABEL: saddo.add.i32:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    add w8, w4, #100
+; SDAG-NEXT:    subs w8, w8, #100
+; SDAG-NEXT:    cset w0, vs
+; SDAG-NEXT:    str w8, [x5]
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: saddo.add.i32:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    mov w0, wzr
+; GISEL-NEXT:    str w4, [x5]
+; GISEL-NEXT:    ret
 entry:
   %lhs = add nsw i32 %v5, 100
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %lhs, i32 -100)
@@ -160,13 +151,20 @@ entry:
 }
 
 define zeroext i1 @uaddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; CHECK-LABEL: uaddo.add.i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add w8, w4, #5
-; CHECK-NEXT:    adds w8, w8, #5
-; CHECK-NEXT:    cset w0, hs
-; CHECK-NEXT:    str w8, [x5]
-; CHECK-NEXT:    ret
+; SDAG-LABEL: uaddo.add.i32:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    add w8, w4, #5
+; SDAG-NEXT:    adds w8, w8, #5
+; SDAG-NEXT:    cset w0, hs
+; SDAG-NEXT:    str w8, [x5]
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: uaddo.add.i32:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    adds w8, w4, #10
+; GISEL-NEXT:    cset w0, hs
+; GISEL-NEXT:    str w8, [x5]
+; GISEL-NEXT:    ret
 entry:
   %lhs = add nuw i32 %v5, 5
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %lhs, i32 5)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index d36f5c0ea89d98..a6f9bb7ee055d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4142,11 +4142,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4162,7 +4162,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4179,7 +4179,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX6-NEXT:    s_ashr_i32 s5, s7, 15
-; GFX6-NEXT:    s_add_u32 s2, s2, 0xffff8000
+; GFX6-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
@@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX8-NEXT:    s_ashr_i32 s5, s7, 15
-; GFX8-NEXT:    s_add_u32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
@@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX9-LABEL: saddsat_i48_vs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4529,11 +4529,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4546,7 +4546,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4560,7 +4560,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX6-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
@@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX8-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX8-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
@@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[0:1], 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -4866,21 +4866,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v7, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[6:7]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, 0x80000000, v2
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4896,10 +4895,10 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[4:5]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v12
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT:    v_add_co_u32 v3, s7, 0x80000000, v4
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
@@ -4921,8 +4920,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, 0, v[6:7]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v12
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s2, s1
@@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
@@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s0
@@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
@@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
@@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
@@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s0
@@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX10-NEXT:    s_add_u32 s0, s2, s6
 ; GFX10-NEXT:    s_addc_u32 s1, s3, s7
@@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], 0
 ; GFX11-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX11-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX11-NEXT:    s_add_u32 s0, s2, s6
 ; GFX11-NEXT:    s_addc_u32 s1, s3, s7
@@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5132,7 +5131,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
@@ -5179,7 +5178,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
@@ -5226,7 +5225,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
@@ -5269,7 +5268,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
-; GFX10-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5310,7 +5309,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX11-NEXT:    s_ashr_i32 s0, s9, 31
-; GFX11-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5412,9 +5411,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX9-NEXT:    v_bfrev_b32_e32 v6, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v3, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_add_u32_e32 v6, 0x80000000, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -5440,7 +5438,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX10-NEXT:    v_add_co_u32 v6, s0, 0x80000000, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 0x80000000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5467,7 +5465,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-NEXT:    v_add_co_u32 v6, null, 0x80000000, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x80000000, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -5569,9 +5567,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5597,9 +5594,9 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v3, s0, 0x80000000, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5627,15 +5624,14 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5762,12 +5758,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
@@ -5786,11 +5781,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
+; GFX9-NEXT:    v_add_u32_e32 v7, 0x80000000, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 0x80000000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
@@ -5832,18 +5827,18 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v17
-; GFX10-NEXT:    v_add_co_u32 v7, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, s4, 0x80000000, v3
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v16, v3, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc_lo
+; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v17, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s4
@@ -5882,18 +5877,17 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v17
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_add_co_u32 v7, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_add_co_u32 v4, null, 0x80000000, v3
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v17, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s0
@@ -5927,7 +5921,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
@@ -5960,7 +5954,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
@@ -6011,7 +6005,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
@@ -6050,7 +6044,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
@@ -6101,7 +6095,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
@@ -6140,7 +6134,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -6184,7 +6178,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    s_ashr_i32 s10, s17, 31
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT:    s_add_u32 s11, s10, 0x80000000
+; GFX10-NEXT:    s_add_i32 s11, s10, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX10-NEXT:    s_add_u32 s0, s4, s12
@@ -6221,7 +6215,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s10, vcc_lo
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v4
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
@@ -6261,7 +6255,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    s_and_b32 s1, 1, s1
 ; GFX11-NEXT:    s_ashr_i32 s10, s17, 31
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT:    s_add_u32 s11, s10, 0x80000000
+; GFX11-NEXT:    s_add_i32 s11, s10, 0x80000000
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX11-NEXT:    s_add_u32 s0, s4, s12
@@ -6299,7 +6293,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0a6b7af2f78d4c..84906c01a4698a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -3091,253 +3091,252 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0, v1
-; GISEL-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], 0, 0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v7
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v4
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 0, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v4
-; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT:    v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6]
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8]
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v10, v4
-; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], v11, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v1, v0, v[5:6]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v11, v7
+; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], 0, v2
-; GISEL-NEXT:    v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v4
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v13, v2
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v1, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v13
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v6, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v7
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], 0, v2
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v2
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v3
+; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v8, v9, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v6, v1
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v4
+; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v5
-; GISEL-NEXT:    v_sub_i32_e32 v15, vcc, 0, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0
-; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, 0, v2, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7]
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, 1, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7]
-; GISEL-NEXT:    v_addc_u32_e32 v18, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v1
-; GISEL-NEXT:    v_mul_hi_u32 v1, v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v14, v11, 0
+; GISEL-NEXT:    v_subb_u32_e32 v15, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v13, v[5:6]
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 1, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v11, v[5:6]
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v13, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v18, v1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v14, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v1
-; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v14, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v17
-; GISEL-NEXT:    v_mov_b32_e32 v1, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2]
-; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v18, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v17, v13, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v18, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_mul_lo_u32 v7, v11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v6
-; GISEL-NEXT:    v_mul_hi_u32 v13, v10, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], 0, v12
-; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v13
+; GISEL-NEXT:    v_mul_lo_u32 v6, v13, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v1
+; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v13, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v14, v8, 0
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v16
+; GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v14, v11, v[1:2]
+; GISEL-NEXT:    v_addc_u32_e32 v18, vcc, 0, v17, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v8, v[5:6]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v16, v13, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v17, v18, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], 0, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v6
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v6
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v5
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v12, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; GISEL-NEXT:    v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v12, v5
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v11, v5
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v8, v4
+; GISEL-NEXT:    v_addc_u32_e64 v5, s[4:5], v11, v5, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v0, v9, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1]
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v4, v0
+; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[0:1]
 ; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v9, v5
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v12, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v2
-; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v6, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v4
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v2
-; GISEL-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v7, v9, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v8
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v2
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v8, v10, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v7
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v7
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v8
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v4, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index c455b24313ddc2..83ebc84e1f84a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3034,253 +3034,251 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
+; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], 0, 0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
-; GISEL-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v1
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v1
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v1
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v4
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v7
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v4
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 0, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v0, v9, v4
-; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT:    v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v13, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v4
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v0
-; GISEL-NEXT:    v_mov_b32_e32 v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1]
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v10, v4
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v11, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v11, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[4:5], 0, v0
-; GISEL-NEXT:    v_addc_u32_e64 v4, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v4
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT:    v_subb_u32_e32 v13, vcc, v5, v3, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v10
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v6, v9, s[4:5]
+; GISEL-NEXT:    v_subb_u32_e32 v10, vcc, v5, v3, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v4
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, v6, v7, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v7, v1
+; GISEL-NEXT:    v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v0
-; GISEL-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v13, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v16, s[4:5], 0, v2
-; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, v4, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v15, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v18, v7
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT:    v_mov_b32_e32 v0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v16, v18, v[0:1]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v1
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v4
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v0
+; GISEL-NEXT:    v_sub_i32_e64 v15, s[4:5], 0, v2
+; GISEL-NEXT:    v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v17, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v17, v15, v[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v19, v0, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v0, v18, v5
-; GISEL-NEXT:    v_mul_lo_u32 v19, v15, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v13, v3, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v13, v15, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v18, v0, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v4
+; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v5
+; GISEL-NEXT:    v_mul_hi_u32 v19, v14, v4
+; GISEL-NEXT:    v_subb_u32_e32 v10, vcc, v10, v3, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v18, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v18, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v19, v0
-; GISEL-NEXT:    v_mul_hi_u32 v19, v15, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v17, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v17, v4
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT:    v_mul_hi_u32 v18, v14, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v19, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT:    v_mul_hi_u32 v6, v18, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v0
-; GISEL-NEXT:    v_addc_u32_e32 v15, vcc, v18, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v16, v13, 0
-; GISEL-NEXT:    v_sub_i32_e32 v18, vcc, v10, v1
-; GISEL-NEXT:    v_mov_b32_e32 v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v15, v[0:1]
-; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v10, v18, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v14, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v15, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v0
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v5
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], 0, v12
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v11, v15, v0
-; GISEL-NEXT:    v_mul_hi_u32 v5, v15, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v0
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_mul_hi_u32 v5, v17, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v18, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v0
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v17, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
+; GISEL-NEXT:    v_sub_i32_e32 v18, vcc, v11, v1
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1]
+; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v11, v18, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v6, v13, v10, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v17, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v0
+; GISEL-NEXT:    v_mul_hi_u32 v10, v14, v4
+; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], 0, v12
 ; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v6
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
-; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v15, v0, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v9, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v10, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v17, v0
+; GISEL-NEXT:    v_mul_hi_u32 v4, v17, v4
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v7, v14, v0
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v10, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v0, v17, v0
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v5
+; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v14, v4
+; GISEL-NEXT:    v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v0
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v0
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v0
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v9, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v0, v7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v2, v7, v[0:1]
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v0, v6
+; GISEL-NEXT:    v_mov_b32_e32 v0, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1]
 ; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v9, v[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v8, v5
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v10, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v10, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v2
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v4
-; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v6, v4, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v3, v2
-; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v4, v2
+; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v4
-; GISEL-NEXT:    v_subb_u32_e32 v4, vcc, v6, v4, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 61e1e67b7ae360..320dfbb4980e4c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4142,11 +4142,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4162,7 +4162,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4179,7 +4179,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX6-NEXT:    s_ashr_i32 s5, s7, 15
-; GFX6-NEXT:    s_add_u32 s2, s2, 0xffff8000
+; GFX6-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
@@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX8-NEXT:    s_ashr_i32 s5, s7, 15
-; GFX8-NEXT:    s_add_u32 s2, s2, 0xffff8000
+; GFX8-NEXT:    s_addk_i32 s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
@@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX9-LABEL: ssubsat_i48_vs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4529,11 +4529,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4546,7 +4546,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4560,7 +4560,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX6-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX6-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
@@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX8-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX8-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
@@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
 ; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX9-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
@@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -4866,21 +4866,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v0, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v2, v6
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
-; GFX9-NEXT:    v_cmp_lt_i64_e64 s[6:7], 0, v[6:7]
+; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
+; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, 0x80000000, v2
-; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
+; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4896,10 +4895,10 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[4:5]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT:    v_add_co_u32 v1, s5, 0x80000000, v12
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT:    v_add_co_u32 v3, s7, 0x80000000, v4
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
@@ -4921,8 +4920,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, 0, v[6:7]
-; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v12
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v4
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s2, s1
@@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
@@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s0
@@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
@@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
@@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
@@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s0
@@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[4:5], 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
-; GFX10-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX10-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX10-NEXT:    s_subb_u32 s1, s3, s7
@@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[4:5], 0
 ; GFX11-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX11-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX11-NEXT:    s_sub_u32 s0, s2, s6
 ; GFX11-NEXT:    s_subb_u32 s1, s3, s7
@@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5134,7 +5133,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
@@ -5183,7 +5182,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
@@ -5232,7 +5231,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
@@ -5274,7 +5273,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s11
@@ -5317,7 +5316,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX11-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
@@ -5427,9 +5426,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5456,7 +5454,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v3, s0, 0x80000000, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5484,8 +5482,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v2
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5594,9 +5591,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5625,7 +5621,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    v_add_co_u32 v3, s0, 0x80000000, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5652,12 +5648,12 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s4
-; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v2
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5805,9 +5801,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc
@@ -5831,8 +5826,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 0x80000000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    v_add_u32_e32 v7, 0x80000000, v6
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
@@ -5877,18 +5872,18 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v21
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
-; GFX10-NEXT:    v_add_co_u32 v7, s5, 0x80000000, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, s4, 0x80000000, v3
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v3, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc_lo
+; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v19, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v20, v6, s4
@@ -5931,18 +5926,16 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v21
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v19
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6
+; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_add_co_u32 v7, null, 0x80000000, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0x80000000, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_add_co_u32 v4, null, 0x80000000, v3
-; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v3, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v3 :: v_dual_and_b32 v5, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v2, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v9, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v20, v6, s0
@@ -5978,7 +5971,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s19, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s17
@@ -6013,7 +6006,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
@@ -6066,7 +6059,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s19, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s17
@@ -6107,7 +6100,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
@@ -6160,7 +6153,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s19, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s17
@@ -6201,7 +6194,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -6244,7 +6237,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    s_ashr_i32 s8, s17, 31
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
-; GFX10-NEXT:    s_add_u32 s9, s8, 0x80000000
+; GFX10-NEXT:    s_add_i32 s9, s8, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
@@ -6273,7 +6266,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX10-NEXT:    s_and_b32 s5, 1, s5
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
@@ -6326,7 +6319,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    s_ashr_i32 s8, s19, 31
 ; GFX11-NEXT:    s_and_b32 s1, 1, s1
-; GFX11-NEXT:    s_add_u32 s9, s8, 0x80000000
+; GFX11-NEXT:    s_add_i32 s9, s8, 0x80000000
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
@@ -6357,7 +6350,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s6
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s18

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 887c43f5fce59e..d15551365707b1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -2062,13 +2062,9 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v2, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
@@ -2077,10 +2073,6 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 5c6bb6dea16466..07480a0ce0c2e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -2480,13 +2480,9 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v17, v2, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
@@ -2495,10 +2491,6 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10


        

