[llvm] [GlobalIsel] Combine ADDE (PR #82413)

Thorsten Schütt via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 20 12:23:44 PST 2024


https://github.com/tschuett created https://github.com/llvm/llvm-project/pull/82413

Clang exposes add-with-carry operations as builtins (__builtin_addc), but the middle end has no intrinsics for them. In GlobalISel they surface as G_UADDE/G_SADDE and are used during legalization.

AArch64: ADCS (add with carry, setting flags)

On Neoverse V2, these instructions run at half the throughput of basic arithmetic and are limited to a subset of the execution pipelines.
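
For reference, a minimal sketch of the builtin's carry-in/carry-out
semantics (illustrative values; the exact lowering is target-dependent):

  #include <stdio.h>

  int main(void) {
    unsigned carry_out;
    /* sum = a + b + carry_in; carry_out receives the carry (0 or 1). */
    unsigned sum = __builtin_addc(0xFFFFFFFFu, 1u, 0u, &carry_out);
    printf("sum=%u carry=%u\n", sum, carry_out); /* sum=0 carry=1 */
    return 0;
  }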

From 5e74a5c86aacdff34011aca5c6ddf25f0c71c37f Mon Sep 17 00:00:00 2001
From: Thorsten Schütt <schuett at gmail.com>
Date: Tue, 20 Feb 2024 15:34:11 +0100
Subject: [PATCH] [GlobalIsel] Combine ADDE

Clang exposes add-with-carry operations as builtins (__builtin_addc),
but the middle end has no intrinsics for them. In GlobalISel they
surface as G_UADDE/G_SADDE and are used during legalization.

AArch64: ADCS (add with carry, setting flags)

On Neoverse V2, these instructions run at half the throughput of basic
arithmetic and are limited to a subset of the execution pipelines.
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   10 +-
 .../CodeGen/GlobalISel/GenericMachineInstrs.h |   17 +
 .../include/llvm/Target/GlobalISel/Combine.td |    8 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  209 ++
 .../AArch64/GlobalISel/combine-adde.mir       |  300 +++
 llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll   |   48 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    | 1745 ++++++++++-------
 .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll |  975 ++++++---
 8 files changed, 2335 insertions(+), 977 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 23728636498ba0..abc2ebdfd878c2 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -810,12 +810,15 @@ class CombinerHelper {
   /// Combine selects.
   bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Combine ands,
+  /// Combine ands.
   bool matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
 
-  /// Combine ors,
+  /// Combine ors.
   bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Combine addes.
+  bool matchAddCarryInOut(MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -919,6 +922,7 @@ class CombinerHelper {
   bool isZeroOrZeroSplat(Register Src, bool AllowUndefs);
   bool isConstantSplatVector(Register Src, int64_t SplatValue,
                              bool AllowUndefs);
+  bool isConstantOrConstantVectorI(Register Src);
 
   std::optional<APInt> getConstantOrConstantSplatVector(Register Src);
 
@@ -930,6 +934,8 @@ class CombinerHelper {
 
   // Simplify (cmp cc0 x, y) (&& or ||) (cmp cc1 x, y) -> cmp cc2 x, y.
   bool tryFoldLogicOfFCmps(GLogicalBinOp *Logic, BuildFnTy &MatchInfo);
+
+  bool isZExtOrTruncLegal(LLT ToTy, LLT FromTy) const;
 };
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index f5a6528d10a973..e46d2d1aac0e86 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -359,6 +359,8 @@ class GBinOpCarryOut : public GenericMachineInstr {
   Register getCarryOutReg() const { return getReg(1); }
   MachineOperand &getLHS() { return getOperand(2); }
   MachineOperand &getRHS() { return getOperand(3); }
+  Register getLHSReg() { return getOperand(2).getReg(); }
+  Register getRHSReg() { return getOperand(3).getReg(); }
 
   static bool classof(const MachineInstr *MI) {
     switch (MI->getOpcode()) {
@@ -448,6 +450,21 @@ class GAddSubCarryInOut : public GAddSubCarryOut {
   }
 };
 
+/// Represents overflowing add operations that also consume a carry-in.
+/// G_UADDE, G_SADDE
+class GAddCarryInOut : public GAddSubCarryInOut {
+public:
+  static bool classof(const MachineInstr *MI) {
+    switch (MI->getOpcode()) {
+    case TargetOpcode::G_UADDE:
+    case TargetOpcode::G_SADDE:
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+
 /// Represents a call to an intrinsic.
 class GIntrinsic final : public GenericMachineInstr {
 public:
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 7eadb718f16415..3a82bc14885beb 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1253,6 +1253,12 @@ def match_ors : GICombineRule<
         [{ return Helper.matchOr(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+def match_addes : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SADDE, G_UADDE):$root,
+        [{ return Helper.matchAddCarryInOut(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 // Combines concat operations
 def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
 def combine_concat_vector : GICombineRule<
@@ -1335,7 +1341,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax, redundant_binop_in_equality,
     fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, 
-    combine_concat_vector]>;
+    combine_concat_vector, match_addes]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 779ec49f4d13a7..2cfc7387ed976d 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6342,6 +6342,23 @@ CombinerHelper::getConstantOrConstantSplatVector(Register Src) {
   return Value;
 }
 
+bool CombinerHelper::isConstantOrConstantVectorI(Register Src) {
+  auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+  if (IConstant)
+    return true;
+  GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+  if (!BuildVector)
+    return false;
+  unsigned NumSources = BuildVector->getNumSources();
+  for (unsigned I = 0; I < NumSources; ++I) {
+    std::optional<ValueAndVReg> IConstant =
+        getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+    if (!IConstant)
+      return false;
+  }
+  return true; // FIXME: G_SPLAT_VECTOR
+}
+
 // TODO: use knownbits to determine zeros
 bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
                                               BuildFnTy &MatchInfo) {
@@ -6906,3 +6923,195 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) {
 
   return false;
 }
+
+bool CombinerHelper::isZExtOrTruncLegal(LLT ToTy, LLT FromTy) const {
+  // Copy.
+  if (ToTy == FromTy)
+    return true;
+
+  if (isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {ToTy, FromTy}}))
+    return true;
+
+  if (isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {ToTy, FromTy}}))
+    return true;
+
+  return false;
+}
+
+bool CombinerHelper::matchAddCarryInOut(MachineInstr &MI,
+                                        BuildFnTy &MatchInfo) {
+  GAddCarryInOut *Add = cast<GAddCarryInOut>(&MI);
+
+  // adde has no flags.
+  Register Dst = Add->getDstReg();
+  Register Carry = Add->getCarryOutReg();
+  Register CarryIn = Add->getCarryInReg();
+  Register LHS = Add->getLHSReg();
+  Register RHS = Add->getRHSReg();
+  bool IsSigned = Add->isSigned();
+  LLT DstTy = MRI.getType(Dst);
+  LLT CarryTy = MRI.getType(Carry);
+  LLT OperandTy = MRI.getType(LHS);
+  LLT CarryInTy = MRI.getType(CarryIn);
+
+  // FIXME: handle undef
+
+  // Fold sadde with an unused carry-out:
+  //   sadde(LHS, RHS, CarryIn) -> add(add(LHS, RHS), zextOrTrunc(CarryIn)),
+  //   and the carry-out becomes undef.
+  if (MRI.use_nodbg_empty(Carry) && IsSigned && MRI.hasOneNonDBGUse(Dst) &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) &&
+      isZExtOrTruncLegal(DstTy, CarryInTy)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      auto A = B.buildAdd(DstTy, LHS, RHS);
+      Register AReg = A.getReg(0);
+      auto ZextCarryIn = B.buildZExtOrTrunc(DstTy, CarryIn);
+      Register ZextCarryInReg = ZextCarryIn.getReg(0);
+      B.buildAdd(Dst, AReg, ZextCarryInReg);
+      B.buildUndef(Carry);
+    };
+    return true;
+  }
+
+  // We want to fold the [u|s]adde.
+  if (!MRI.hasOneNonDBGUse(Dst) || !MRI.hasOneNonDBGUse(Carry))
+    return false;
+
+  // Query the operands of the adde for constants or constant splat vectors.
+  std::optional<APInt> MaybeLHS = getConstantOrConstantSplatVector(LHS);
+  std::optional<APInt> MaybeRHS = getConstantOrConstantSplatVector(RHS);
+  std::optional<APInt> MaybeCarryIn = getConstantOrConstantSplatVector(CarryIn);
+
+  // fold adde(c, c, c) -> c, carry
+  if (MaybeLHS && MaybeRHS && MaybeCarryIn &&
+      isConstantLegalOrBeforeLegalizer(DstTy) &&
+      isConstantLegalOrBeforeLegalizer(CarryTy)) {
+    // The APInts must all have the same bitwidth, otherwise APInt
+    // arithmetic asserts. Before legalization, the operands may have
+    // widely different bitwidths.
+    unsigned BitWidth =
+        std::max(std::max(MaybeLHS->getBitWidth(), MaybeRHS->getBitWidth()),
+                 MaybeCarryIn->getBitWidth());
+    if (IsSigned) {
+      APInt LHS = MaybeLHS->sext(BitWidth);
+      APInt RHS = MaybeRHS->sext(BitWidth);
+      APInt CarryIn = MaybeCarryIn->zext(BitWidth);
+      bool FirstOverflowed = false;
+      bool SecondOverflowed = false;
+      APInt Result =
+          LHS.sadd_ov(RHS, FirstOverflowed).sadd_ov(CarryIn, SecondOverflowed);
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildConstant(Dst, Result);
+        B.buildConstant(Carry, FirstOverflowed | SecondOverflowed);
+      };
+      return true;
+    } else {
+      APInt LHS = MaybeLHS->zext(BitWidth);
+      APInt RHS = MaybeRHS->zext(BitWidth);
+      APInt CarryIn = MaybeCarryIn->zext(BitWidth);
+      bool FirstOverflowed = false;
+      bool SecondOverflowed = false;
+      APInt Result =
+          LHS.uadd_ov(RHS, FirstOverflowed).uadd_ov(CarryIn, SecondOverflowed);
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildConstant(Dst, Result);
+        B.buildConstant(Carry, FirstOverflowed | SecondOverflowed);
+      };
+      return true;
+    }
+  }
+
+  // Canonicalize a constant LHS to the RHS.
+  if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) {
+    if (IsSigned) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSAdde(Dst, Carry, RHS, LHS, CarryIn);
+      };
+      return true;
+    } else {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildUAdde(Dst, Carry, RHS, LHS, CarryIn);
+      };
+      return true;
+    }
+  }
+
+  // fold adde(LHS, RHS, 0) -> addo(LHS, RHS)
+  if (MaybeCarryIn && *MaybeCarryIn == 0) {
+    if (IsSigned && isLegalOrBeforeLegalizer(
+                        {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}})) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildSAddo(Dst, Carry, LHS, RHS);
+      };
+      return true;
+    } else if (!IsSigned &&
+               isLegalOrBeforeLegalizer(
+                   {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}})) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        B.buildUAddo(Dst, Carry, LHS, RHS);
+      };
+      return true;
+    }
+  }
+
+  // fold adde(LHS, 0, CarryIn) -> addo(LHS, zextOrTrunc(CarryIn))
+  if (MaybeRHS && *MaybeRHS == 0) {
+    if (IsSigned &&
+        isLegalOrBeforeLegalizer(
+            {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}}) &&
+        isZExtOrTruncLegal(OperandTy, CarryInTy)) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn);
+        Register ZextCarryInReg = ZextCarryIn.getReg(0);
+        B.buildSAddo(Dst, Carry, LHS, ZextCarryInReg);
+      };
+      return true;
+    } else if (!IsSigned &&
+               isLegalOrBeforeLegalizer(
+                   {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}}) &&
+               isZExtOrTruncLegal(OperandTy, CarryInTy)) {
+      MatchInfo = [=](MachineIRBuilder &B) {
+        auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn);
+        Register ZextCarryInReg = ZextCarryIn.getReg(0);
+        B.buildUAddo(Dst, Carry, LHS, ZextCarryInReg);
+      };
+      return true;
+    }
+  }
+
+  // Otherwise, lower to 2*addo + 1*or of the carry-outs.
+  if (IsSigned &&
+      isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_SADDO, {DstTy, CarryTy, OperandTy}}) &&
+      isLegalOrBeforeLegalizer({TargetOpcode::G_OR, {DstTy}}) &&
+      isZExtOrTruncLegal(OperandTy, CarryInTy)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      auto First = B.buildSAddo(DstTy, CarryTy, LHS, RHS);
+      Register FirstResult = First.getReg(0);
+      Register FirstCarry = First.getReg(1);
+      auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn);
+      auto Second = B.buildSAddo(DstTy, CarryTy, FirstResult, ZextCarryIn);
+      Register Result = Second.getReg(0);
+      Register SecondCarry = Second.getReg(1);
+      B.buildCopy(Dst, Result);
+      B.buildOr(Carry, FirstCarry, SecondCarry);
+    };
+    return true;
+  } else if (!IsSigned &&
+             isLegalOrBeforeLegalizer(
+                 {TargetOpcode::G_UADDO, {DstTy, CarryTy, OperandTy}}) &&
+             isLegalOrBeforeLegalizer({TargetOpcode::G_OR, {DstTy}}) &&
+             isZExtOrTruncLegal(OperandTy, CarryInTy)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      auto First = B.buildUAddo(DstTy, CarryTy, LHS, RHS);
+      Register FirstResult = First.getReg(0);
+      Register FirstCarry = First.getReg(1);
+      auto ZextCarryIn = B.buildZExtOrTrunc(OperandTy, CarryIn);
+      auto Second = B.buildUAddo(DstTy, CarryTy, FirstResult, ZextCarryIn);
+      Register Result = Second.getReg(0);
+      Register SecondCarry = Second.getReg(1);
+      B.buildCopy(Dst, Result);
+      B.buildOr(Carry, FirstCarry, SecondCarry);
+    };
+    return true;
+  }
+
+  return false;
+}
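
The lowering above decomposes adde into 2*addo + 1*or. A minimal C sketch
of the unsigned case, using __builtin_add_overflow as a stand-in for
G_UADDO (names are illustrative):

  #include <stdbool.h>

  /* uadde(a, b, cin) -> uaddo(uaddo(a, b), cin) with the carries or'ed. */
  unsigned lower_uadde(unsigned a, unsigned b, unsigned cin,
                       bool *carry_out) {
    unsigned t, r;
    bool c1 = __builtin_add_overflow(a, b, &t);   /* first G_UADDO  */
    bool c2 = __builtin_add_overflow(t, cin, &r); /* second G_UADDO */
    *carry_out = c1 | c2;                         /* G_OR           */
    return r;
  }

When cin is 0 or 1, at most one of the two adds can carry, so or'ing the
two carry bits loses nothing.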
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir
new file mode 100644
index 00000000000000..61c7f56f4b2605
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-adde.mir
@@ -0,0 +1,300 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+---
+# add, _ = sadde(_, _, In)
+name:            carryout_unused
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: carryout_unused
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1)
+    ; CHECK-NEXT: %add:_(s64) = G_ADD [[ADD]], [[ZEXT]]
+    ; CHECK-NEXT: $x0 = COPY %add(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %lhs:_(s64) = COPY %3
+    %rhs:_(s64) = COPY %3
+    %carry_in:_(s1) = G_TRUNC %4
+    %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in
+    $x0 = COPY %add
+...
+---
+# add, _ = uadde(_, _, In)
+name:            carryout_unused_unsigned
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: carryout_unused_unsigned
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_UADDE [[COPY]], [[COPY]], %carry_in
+    ; CHECK-NEXT: $x0 = COPY %add(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %lhs:_(s64) = COPY %3
+    %rhs:_(s64) = COPY %3
+    %carry_in:_(s1) = G_TRUNC %4
+    %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in
+    $x0 = COPY %add
+...
+---
+# add, multi_c = uadde(L, R, In)
+name:            multi_use_unsigned
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: multi_use_unsigned
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_UADDE [[COPY]], [[COPY]], %carry_in
+    ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1)
+    ; CHECK-NEXT: %carry_out_ext2:_(s64) = G_ANYEXT %carry_out(s1)
+    ; CHECK-NEXT: $x0 = COPY %add(s64)
+    ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64)
+    ; CHECK-NEXT: $x2 = COPY %carry_out_ext2(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %lhs:_(s64) = COPY %3
+    %rhs:_(s64) = COPY %3
+    %carry_in:_(s1) = G_TRUNC %4
+    %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in
+    %carry_out_ext:_(s64) = G_ANYEXT %carry_out
+    %carry_out_ext2:_(s64) = G_ANYEXT %carry_out
+    $x0 = COPY %add
+    $x1 = COPY %carry_out_ext
+    $x2 = COPY %carry_out_ext2
+...
+---
+# add, c = sadde(L, R, In)
+name:            constant_fold_signed
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constant_fold_signed
+    ; CHECK: %add:_(s64) = G_CONSTANT i64 29
+    ; CHECK-NEXT: %carry_out_ext:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: $x0 = COPY %add(s64)
+    ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = G_CONSTANT i64 1
+    %lhs:_(s64) = G_CONSTANT i64 11
+    %rhs:_(s64) = G_CONSTANT i64 17
+    %carry_in:_(s1) = G_CONSTANT i1 1
+    %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in
+    %carry_out_ext:_(s64) = G_ANYEXT %carry_out
+    $x0 = COPY %add
+    $x1 = COPY %carry_out_ext
+...
+---
+# add, c = uadde(L, R, In)
+name:            constant_fold_unsigned
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: constant_fold_unsigned
+    ; CHECK: %add:_(s64) = G_CONSTANT i64 27
+    ; CHECK-NEXT: %carry_out_ext:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: $x0 = COPY %add(s64)
+    ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = G_CONSTANT i64 1
+    %lhs:_(s64) = G_CONSTANT i64 19
+    %rhs:_(s64) = G_CONSTANT i64 7
+    %carry_in:_(s1) = G_CONSTANT i1 1
+    %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in
+    %carry_out_ext:_(s64) = G_ANYEXT %carry_out
+    $x0 = COPY %add
+    $x1 = COPY %carry_out_ext
+...
+---
+# add, c = uadde(L, R, In)
+name:            canonicalize_to_rhs_plus_lower
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: canonicalize_to_rhs_plus_lower
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: %lhs:_(s64) = G_CONSTANT i64 19
+    ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[COPY]], %lhs
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1)
+    ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(s64), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[ZEXT]]
+    ; CHECK-NEXT: %carry_out:_(s1) = G_OR [[UADDO1]], [[UADDO3]]
+    ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1)
+    ; CHECK-NEXT: $x0 = COPY [[UADDO2]](s64)
+    ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %lhs:_(s64) = G_CONSTANT i64 19
+    %rhs:_(s64) = COPY %3
+    %carry_in:_(s1) = G_TRUNC %4
+    %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in
+    %carry_out_ext:_(s64) = G_ANYEXT %carry_out
+    $x0 = COPY %add
+    $x1 = COPY %carry_out_ext
+...
+---
+# add, c = sadde(L, R, 0)
+name:            fold_to_addo_l_r
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: fold_to_addo_l_r
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_SADDO [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1)
+    ; CHECK-NEXT: $x0 = COPY %add(s64)
+    ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %lhs:_(s64) = COPY %3
+    %rhs:_(s64) = COPY %4
+    %carry_in:_(s1) = G_CONSTANT i1 0
+    %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in
+    %carry_out_ext:_(s64) = G_ANYEXT %carry_out
+    $x0 = COPY %add
+    $x1 = COPY %carry_out_ext
+...
+---
+# add, c = sadde(L, 0, CarryIn)
+name:            fold_to_addo_l_carryin
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: fold_to_addo_l_carryin
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1)
+    ; CHECK-NEXT: %add:_(s64), %carry_out:_(s1) = G_SADDO [[COPY]], [[ZEXT]]
+    ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1)
+    ; CHECK-NEXT: $x0 = COPY %add(s64)
+    ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %lhs:_(s64) = COPY %3
+    %rhs:_(s64) = G_CONSTANT i64 0
+    %carry_in:_(s1) = G_TRUNC %4
+    %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in
+    %carry_out_ext:_(s64) = G_ANYEXT %carry_out
+    $x0 = COPY %add
+    $x1 = COPY %carry_out_ext
+...
+---
+# add, c = sadde(L, R, CarryIn)
+name:            fold_to_lower_signed
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: fold_to_lower_signed
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[SADDO:%[0-9]+]]:_(s64), [[SADDO1:%[0-9]+]]:_(s1) = G_SADDO [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1)
+    ; CHECK-NEXT: [[SADDO2:%[0-9]+]]:_(s64), [[SADDO3:%[0-9]+]]:_(s1) = G_SADDO [[SADDO]], [[ZEXT]]
+    ; CHECK-NEXT: %carry_out:_(s1) = G_OR [[SADDO1]], [[SADDO3]]
+    ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1)
+    ; CHECK-NEXT: $x0 = COPY [[SADDO2]](s64)
+    ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %lhs:_(s64) = COPY %3
+    %rhs:_(s64) = COPY %4
+    %carry_in:_(s1) = G_TRUNC %4
+    %add:_(s64), %carry_out:_(s1) = G_SADDE %lhs, %rhs, %carry_in
+    %carry_out_ext:_(s64) = G_ANYEXT %carry_out
+    $x0 = COPY %add
+    $x1 = COPY %carry_out_ext
+...
+---
+# add, c = uadde(L, R, CarryIn)
+name:            fold_to_lower_unsigned
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: fold_to_lower_unsigned
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: %carry_in:_(s1) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[COPY]], [[COPY1]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %carry_in(s1)
+    ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(s64), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[ZEXT]]
+    ; CHECK-NEXT: %carry_out:_(s1) = G_OR [[UADDO1]], [[UADDO3]]
+    ; CHECK-NEXT: %carry_out_ext:_(s64) = G_ANYEXT %carry_out(s1)
+    ; CHECK-NEXT: $x0 = COPY [[UADDO2]](s64)
+    ; CHECK-NEXT: $x1 = COPY %carry_out_ext(s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %lhs:_(s64) = COPY %3
+    %rhs:_(s64) = COPY %4
+    %carry_in:_(s1) = G_TRUNC %4
+    %add:_(s64), %carry_out:_(s1) = G_UADDE %lhs, %rhs, %carry_in
+    %carry_out_ext:_(s64) = G_ANYEXT %carry_out
+    $x0 = COPY %add
+    $x1 = COPY %carry_out_ext
+...
+---
+# add, c = uadde(L, R, CarryIn)
+name:            fold_to_lower_vectorized
+body:             |
+  bb.0.entry:
+    ; CHECK-LABEL: name: fold_to_lower_vectorized
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4
+    ; CHECK-NEXT: %onebit:_(s1) = G_TRUNC [[COPY4]](s64)
+    ; CHECK-NEXT: %lhs:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64)
+    ; CHECK-NEXT: %rhs:_(<2 x s64>) = G_BUILD_VECTOR [[COPY2]](s64), [[COPY3]](s64)
+    ; CHECK-NEXT: %carry_in:_(<2 x s1>) = G_BUILD_VECTOR %onebit(s1), %onebit(s1)
+    ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(<2 x s64>), [[UADDO1:%[0-9]+]]:_(<2 x s1>) = G_UADDO %lhs, %rhs
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<2 x s64>) = G_ZEXT %carry_in(<2 x s1>)
+    ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(<2 x s64>), [[UADDO3:%[0-9]+]]:_(<2 x s1>) = G_UADDO [[UADDO]], [[ZEXT]]
+    ; CHECK-NEXT: %carry_out:_(<2 x s1>) = G_OR [[UADDO1]], [[UADDO3]]
+    ; CHECK-NEXT: %zext:_(<2 x s64>) = G_ZEXT %carry_out(<2 x s1>)
+    ; CHECK-NEXT: $q0 = COPY %zext(<2 x s64>)
+    ; CHECK-NEXT: $q0 = COPY [[UADDO2]](<2 x s64>)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s64) = COPY $x4
+    %onebit:_(s1) = G_TRUNC %4
+    %lhs:_(<2 x s64>) = G_BUILD_VECTOR %0(s64), %1(s64)
+    %rhs:_(<2 x s64>) = G_BUILD_VECTOR %2(s64), %3(s64)
+    %carry_in:_(<2 x s1>) = G_BUILD_VECTOR %onebit(s1), %onebit(s1)
+    %add:_(<2 x s64>), %carry_out:_(<2 x s1>) = G_UADDE %lhs, %rhs, %carry_in
+    %zext:_(<2 x s64>) = G_ZEXT %carry_out(<2 x s1>)
+    $q0 = COPY %zext
+    $q0 = COPY %add
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index ff5880819020da..f337e6cf55292e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -39,9 +39,12 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) {
 ; GFX7-LABEL: v_uaddo_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v2
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX7-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v2
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -49,9 +52,12 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) {
 ; GFX8-LABEL: v_uaddo_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v2
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -59,9 +65,12 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) {
 ; GFX9-LABEL: v_uaddo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[4:5], v1, v2
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -477,8 +486,13 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-LABEL: s_uaddo_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_add_u32 s0, s0, s2
-; GFX7-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_add_u32 s1, s1, s3
+; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX7-NEXT:    s_add_u32 s1, s1, s2
+; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_or_b32 s2, s3, s2
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
 ; GFX7-NEXT:    s_add_u32 s0, s0, s2
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -486,8 +500,13 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX8-LABEL: s_uaddo_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
-; GFX8-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s3
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_or_b32 s2, s3, s2
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -495,8 +514,13 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX9-LABEL: s_uaddo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
-; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s3
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_or_b32 s2, s3, s2
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX9-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 4c1935d06517e5..eff845a146aceb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1084,7 +1084,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX7-NEXT:    v_mul_hi_u32 v3, s16, v1
 ; GFX7-NEXT:    s_mul_i32 s18, s1, s8
-; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX7-NEXT:    s_add_u32 s18, s18, s17
 ; GFX7-NEXT:    s_addc_u32 s17, s23, s22
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s11
@@ -1095,33 +1095,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s24, s1, s11
 ; GFX7-NEXT:    v_readfirstlane_b32 s28, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    v_readfirstlane_b32 s27, v5
+; GFX7-NEXT:    v_readfirstlane_b32 s25, v5
 ; GFX7-NEXT:    v_mul_hi_u32 v5, v3, s9
-; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX7-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX7-NEXT:    s_add_u32 s24, s24, s22
-; GFX7-NEXT:    s_addc_u32 s23, s27, s23
+; GFX7-NEXT:    s_addc_u32 s23, s25, s23
 ; GFX7-NEXT:    v_readfirstlane_b32 s29, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX7-NEXT:    v_mul_hi_u32 v6, v5, s8
-; GFX7-NEXT:    s_mul_i32 s27, s2, s10
+; GFX7-NEXT:    s_mul_i32 s25, s2, s10
 ; GFX7-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX7-NEXT:    s_add_u32 s24, s27, s24
+; GFX7-NEXT:    s_add_u32 s24, s25, s24
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s10
-; GFX7-NEXT:    s_addc_u32 s27, s28, s23
+; GFX7-NEXT:    s_addc_u32 s25, s28, s23
 ; GFX7-NEXT:    s_mul_i32 s28, s3, s9
 ; GFX7-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX7-NEXT:    s_add_u32 s28, s28, s24
 ; GFX7-NEXT:    v_readfirstlane_b32 s30, v6
 ; GFX7-NEXT:    v_mul_hi_u32 v6, s16, v4
-; GFX7-NEXT:    s_addc_u32 s27, s29, s27
+; GFX7-NEXT:    s_addc_u32 s25, s29, s25
 ; GFX7-NEXT:    s_mul_i32 s29, s4, s8
 ; GFX7-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX7-NEXT:    s_add_u32 s28, s29, s28
 ; GFX7-NEXT:    v_readfirstlane_b32 s33, v0
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v2, s9
-; GFX7-NEXT:    s_addc_u32 s27, s30, s27
+; GFX7-NEXT:    s_addc_u32 s29, s30, s25
 ; GFX7-NEXT:    s_mul_i32 s30, s16, s11
-; GFX7-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s31, v6
 ; GFX7-NEXT:    s_add_u32 s19, s30, s19
 ; GFX7-NEXT:    s_addc_u32 s28, s31, s28
@@ -1139,88 +1139,93 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_cselect_b32 s33, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX7-NEXT:    s_add_u32 s19, s34, s19
-; GFX7-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX7-NEXT:    s_addc_u32 s28, s35, s28
-; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
 ; GFX7-NEXT:    s_cselect_b32 s34, 1, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX7-NEXT:    s_addc_u32 s19, s25, s19
-; GFX7-NEXT:    v_mov_b32_e32 v2, s13
-; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX7-NEXT:    s_add_u32 s19, s26, s19
+; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX7-NEXT:    s_add_u32 s19, s19, s27
+; GFX7-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX7-NEXT:    s_or_b32 s26, s26, s27
+; GFX7-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX7-NEXT:    v_mul_hi_u32 v6, s1, v2
+; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
 ; GFX7-NEXT:    s_addc_u32 s20, s20, 0
-; GFX7-NEXT:    v_readfirstlane_b32 s26, v0
+; GFX7-NEXT:    s_add_u32 s20, s20, s28
+; GFX7-NEXT:    v_mov_b32_e32 v2, s13
+; GFX7-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX7-NEXT:    s_and_b32 s26, s26, 1
+; GFX7-NEXT:    v_mul_hi_u32 v6, s1, v2
+; GFX7-NEXT:    s_add_u32 s20, s20, s26
+; GFX7-NEXT:    v_readfirstlane_b32 s27, v0
 ; GFX7-NEXT:    v_mul_hi_u32 v0, s2, v1
-; GFX7-NEXT:    s_cmp_lg_u32 s25, 0
-; GFX7-NEXT:    s_addc_u32 s20, s20, s28
-; GFX7-NEXT:    s_mul_i32 s25, s16, s14
+; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX7-NEXT:    s_or_b32 s21, s21, s26
+; GFX7-NEXT:    s_mul_i32 s26, s16, s14
 ; GFX7-NEXT:    s_mul_i32 s28, s1, s13
-; GFX7-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v3, s11
 ; GFX7-NEXT:    s_mul_i32 s28, s2, s12
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v5, s10
 ; GFX7-NEXT:    s_mul_i32 s28, s3, s11
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX7-NEXT:    v_mul_hi_u32 v6, v0, s9
 ; GFX7-NEXT:    s_mul_i32 s28, s4, s10
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
 ; GFX7-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX7-NEXT:    v_mul_hi_u32 v6, v6, s8
 ; GFX7-NEXT:    s_mul_i32 s28, s5, s9
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
 ; GFX7-NEXT:    v_mul_hi_u32 v2, s16, v2
 ; GFX7-NEXT:    v_readfirstlane_b32 s36, v1
 ; GFX7-NEXT:    v_mul_hi_u32 v1, s2, v4
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    s_mul_i32 s28, s6, s8
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    s_mul_i32 s28, s16, s13
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v2
-; GFX7-NEXT:    s_add_u32 s27, s28, s27
+; GFX7-NEXT:    s_add_u32 s28, s28, s29
 ; GFX7-NEXT:    v_readfirstlane_b32 s37, v1
 ; GFX7-NEXT:    v_mul_hi_u32 v1, v3, s10
-; GFX7-NEXT:    s_addc_u32 s25, s35, s25
+; GFX7-NEXT:    s_addc_u32 s26, s35, s26
 ; GFX7-NEXT:    s_mul_i32 s35, s1, s12
-; GFX7-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX7-NEXT:    s_add_u32 s27, s35, s27
-; GFX7-NEXT:    s_addc_u32 s25, s36, s25
+; GFX7-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX7-NEXT:    s_add_u32 s28, s35, s28
+; GFX7-NEXT:    s_addc_u32 s26, s36, s26
 ; GFX7-NEXT:    s_mul_i32 s36, s2, s11
 ; GFX7-NEXT:    s_cselect_b32 s35, 1, 0
-; GFX7-NEXT:    s_add_u32 s27, s36, s27
+; GFX7-NEXT:    s_add_u32 s28, s36, s28
 ; GFX7-NEXT:    v_readfirstlane_b32 s38, v1
 ; GFX7-NEXT:    v_mul_hi_u32 v1, v5, s9
-; GFX7-NEXT:    s_addc_u32 s25, s37, s25
+; GFX7-NEXT:    s_addc_u32 s26, s37, s26
 ; GFX7-NEXT:    s_mul_i32 s37, s3, s10
 ; GFX7-NEXT:    s_cselect_b32 s36, 1, 0
-; GFX7-NEXT:    s_add_u32 s27, s37, s27
+; GFX7-NEXT:    s_add_u32 s28, s37, s28
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s8
-; GFX7-NEXT:    s_addc_u32 s25, s38, s25
+; GFX7-NEXT:    s_addc_u32 s26, s38, s26
 ; GFX7-NEXT:    s_mul_i32 s38, s4, s9
 ; GFX7-NEXT:    s_cselect_b32 s37, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s39, v1
-; GFX7-NEXT:    s_add_u32 s27, s38, s27
-; GFX7-NEXT:    s_addc_u32 s25, s39, s25
+; GFX7-NEXT:    s_add_u32 s28, s38, s28
+; GFX7-NEXT:    s_addc_u32 s26, s39, s26
 ; GFX7-NEXT:    s_mul_i32 s39, s5, s8
 ; GFX7-NEXT:    s_cselect_b32 s38, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s40, v0
-; GFX7-NEXT:    s_add_u32 s27, s39, s27
-; GFX7-NEXT:    s_addc_u32 s25, s40, s25
+; GFX7-NEXT:    s_add_u32 s28, s39, s28
+; GFX7-NEXT:    s_addc_u32 s26, s40, s26
 ; GFX7-NEXT:    s_cselect_b32 s39, 1, 0
 ; GFX7-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX7-NEXT:    s_addc_u32 s30, s30, 0
@@ -1228,19 +1233,28 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_addc_u32 s30, s30, 0
 ; GFX7-NEXT:    s_cmp_lg_u32 s34, 0
 ; GFX7-NEXT:    s_addc_u32 s30, s30, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX7-NEXT:    s_addc_u32 s21, s30, s27
-; GFX7-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX7-NEXT:    s_add_u32 s28, s30, s28
+; GFX7-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX7-NEXT:    s_and_b32 s21, s21, 1
+; GFX7-NEXT:    s_add_u32 s21, s28, s21
+; GFX7-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX7-NEXT:    s_or_b32 s28, s30, s28
 ; GFX7-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX7-NEXT:    s_addc_u32 s22, s22, 0
 ; GFX7-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX7-NEXT:    s_addc_u32 s22, s22, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s25, 0
 ; GFX7-NEXT:    s_addc_u32 s22, s22, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX7-NEXT:    s_addc_u32 s22, s22, s25
+; GFX7-NEXT:    s_add_u32 s22, s22, s26
+; GFX7-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX7-NEXT:    s_and_b32 s24, s28, 1
+; GFX7-NEXT:    s_add_u32 s22, s22, s24
+; GFX7-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX7-NEXT:    s_or_b32 s23, s23, s24
 ; GFX7-NEXT:    s_mul_i32 s16, s16, s15
-; GFX7-NEXT:    s_addc_u32 s15, s26, s16
+; GFX7-NEXT:    s_and_b32 s15, s23, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX7-NEXT:    s_addc_u32 s15, s27, s16
 ; GFX7-NEXT:    s_mul_i32 s1, s1, s14
 ; GFX7-NEXT:    s_cmp_lg_u32 s39, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s15, s1
@@ -1257,7 +1271,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_cmp_lg_u32 s35, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX7-NEXT:    s_mul_i32 s6, s6, s9
-; GFX7-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s29, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX7-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX7-NEXT:    s_mul_i32 s0, s0, s8
@@ -1305,7 +1319,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s16, v1
 ; GFX8-NEXT:    s_mul_i32 s18, s1, s8
-; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX8-NEXT:    s_add_u32 s18, s18, s17
 ; GFX8-NEXT:    s_addc_u32 s17, s23, s22
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s11
@@ -1316,33 +1330,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s24, s1, s11
 ; GFX8-NEXT:    v_readfirstlane_b32 s28, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_readfirstlane_b32 s27, v5
+; GFX8-NEXT:    v_readfirstlane_b32 s25, v5
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v3, s9
-; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX8-NEXT:    s_add_u32 s24, s24, s22
-; GFX8-NEXT:    s_addc_u32 s23, s27, s23
+; GFX8-NEXT:    s_addc_u32 s23, s25, s23
 ; GFX8-NEXT:    v_readfirstlane_b32 s29, v5
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v5, s8
-; GFX8-NEXT:    s_mul_i32 s27, s2, s10
+; GFX8-NEXT:    s_mul_i32 s25, s2, s10
 ; GFX8-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX8-NEXT:    s_add_u32 s24, s27, s24
+; GFX8-NEXT:    s_add_u32 s24, s25, s24
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s10
-; GFX8-NEXT:    s_addc_u32 s27, s28, s23
+; GFX8-NEXT:    s_addc_u32 s25, s28, s23
 ; GFX8-NEXT:    s_mul_i32 s28, s3, s9
 ; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX8-NEXT:    s_add_u32 s28, s28, s24
 ; GFX8-NEXT:    v_readfirstlane_b32 s30, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, s16, v4
-; GFX8-NEXT:    s_addc_u32 s27, s29, s27
+; GFX8-NEXT:    s_addc_u32 s25, s29, s25
 ; GFX8-NEXT:    s_mul_i32 s29, s4, s8
 ; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX8-NEXT:    s_add_u32 s28, s29, s28
 ; GFX8-NEXT:    v_readfirstlane_b32 s33, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v2, s9
-; GFX8-NEXT:    s_addc_u32 s27, s30, s27
+; GFX8-NEXT:    s_addc_u32 s29, s30, s25
 ; GFX8-NEXT:    s_mul_i32 s30, s16, s11
-; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s31, v6
 ; GFX8-NEXT:    s_add_u32 s19, s30, s19
 ; GFX8-NEXT:    s_addc_u32 s28, s31, s28
@@ -1360,88 +1374,93 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_cselect_b32 s33, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX8-NEXT:    s_add_u32 s19, s34, s19
-; GFX8-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX8-NEXT:    s_addc_u32 s28, s35, s28
-; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
 ; GFX8-NEXT:    s_cselect_b32 s34, 1, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX8-NEXT:    s_addc_u32 s19, s25, s19
-; GFX8-NEXT:    v_mov_b32_e32 v2, s13
-; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX8-NEXT:    s_add_u32 s19, s26, s19
+; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_add_u32 s19, s19, s27
+; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX8-NEXT:    s_or_b32 s26, s26, s27
+; GFX8-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v2
+; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
 ; GFX8-NEXT:    s_addc_u32 s20, s20, 0
-; GFX8-NEXT:    v_readfirstlane_b32 s26, v0
+; GFX8-NEXT:    s_add_u32 s20, s20, s28
+; GFX8-NEXT:    v_mov_b32_e32 v2, s13
+; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX8-NEXT:    s_and_b32 s26, s26, 1
+; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v2
+; GFX8-NEXT:    s_add_u32 s20, s20, s26
+; GFX8-NEXT:    v_readfirstlane_b32 s27, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v1
-; GFX8-NEXT:    s_cmp_lg_u32 s25, 0
-; GFX8-NEXT:    s_addc_u32 s20, s20, s28
-; GFX8-NEXT:    s_mul_i32 s25, s16, s14
+; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_or_b32 s21, s21, s26
+; GFX8-NEXT:    s_mul_i32 s26, s16, s14
 ; GFX8-NEXT:    s_mul_i32 s28, s1, s13
-; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v3, s11
 ; GFX8-NEXT:    s_mul_i32 s28, s2, s12
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v5, s10
 ; GFX8-NEXT:    s_mul_i32 s28, s3, s11
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v0, s9
 ; GFX8-NEXT:    s_mul_i32 s28, s4, s10
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v6, s8
 ; GFX8-NEXT:    s_mul_i32 s28, s5, s9
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
 ; GFX8-NEXT:    v_mul_hi_u32 v2, s16, v2
 ; GFX8-NEXT:    v_readfirstlane_b32 s36, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v4
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    s_mul_i32 s28, s6, s8
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    s_mul_i32 s28, s16, s13
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v2
-; GFX8-NEXT:    s_add_u32 s27, s28, s27
+; GFX8-NEXT:    s_add_u32 s28, s28, s29
 ; GFX8-NEXT:    v_readfirstlane_b32 s37, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v3, s10
-; GFX8-NEXT:    s_addc_u32 s25, s35, s25
+; GFX8-NEXT:    s_addc_u32 s26, s35, s26
 ; GFX8-NEXT:    s_mul_i32 s35, s1, s12
-; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX8-NEXT:    s_add_u32 s27, s35, s27
-; GFX8-NEXT:    s_addc_u32 s25, s36, s25
+; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX8-NEXT:    s_add_u32 s28, s35, s28
+; GFX8-NEXT:    s_addc_u32 s26, s36, s26
 ; GFX8-NEXT:    s_mul_i32 s36, s2, s11
 ; GFX8-NEXT:    s_cselect_b32 s35, 1, 0
-; GFX8-NEXT:    s_add_u32 s27, s36, s27
+; GFX8-NEXT:    s_add_u32 s28, s36, s28
 ; GFX8-NEXT:    v_readfirstlane_b32 s38, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v5, s9
-; GFX8-NEXT:    s_addc_u32 s25, s37, s25
+; GFX8-NEXT:    s_addc_u32 s26, s37, s26
 ; GFX8-NEXT:    s_mul_i32 s37, s3, s10
 ; GFX8-NEXT:    s_cselect_b32 s36, 1, 0
-; GFX8-NEXT:    s_add_u32 s27, s37, s27
+; GFX8-NEXT:    s_add_u32 s28, s37, s28
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s8
-; GFX8-NEXT:    s_addc_u32 s25, s38, s25
+; GFX8-NEXT:    s_addc_u32 s26, s38, s26
 ; GFX8-NEXT:    s_mul_i32 s38, s4, s9
 ; GFX8-NEXT:    s_cselect_b32 s37, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s39, v1
-; GFX8-NEXT:    s_add_u32 s27, s38, s27
-; GFX8-NEXT:    s_addc_u32 s25, s39, s25
+; GFX8-NEXT:    s_add_u32 s28, s38, s28
+; GFX8-NEXT:    s_addc_u32 s26, s39, s26
 ; GFX8-NEXT:    s_mul_i32 s39, s5, s8
 ; GFX8-NEXT:    s_cselect_b32 s38, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s40, v0
-; GFX8-NEXT:    s_add_u32 s27, s39, s27
-; GFX8-NEXT:    s_addc_u32 s25, s40, s25
+; GFX8-NEXT:    s_add_u32 s28, s39, s28
+; GFX8-NEXT:    s_addc_u32 s26, s40, s26
 ; GFX8-NEXT:    s_cselect_b32 s39, 1, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX8-NEXT:    s_addc_u32 s30, s30, 0
@@ -1449,19 +1468,28 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_addc_u32 s30, s30, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s34, 0
 ; GFX8-NEXT:    s_addc_u32 s30, s30, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX8-NEXT:    s_addc_u32 s21, s30, s27
-; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX8-NEXT:    s_add_u32 s28, s30, s28
+; GFX8-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX8-NEXT:    s_and_b32 s21, s21, 1
+; GFX8-NEXT:    s_add_u32 s21, s28, s21
+; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_or_b32 s28, s30, s28
 ; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_addc_u32 s22, s22, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX8-NEXT:    s_addc_u32 s22, s22, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s25, 0
 ; GFX8-NEXT:    s_addc_u32 s22, s22, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX8-NEXT:    s_addc_u32 s22, s22, s25
+; GFX8-NEXT:    s_add_u32 s22, s22, s26
+; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX8-NEXT:    s_and_b32 s24, s28, 1
+; GFX8-NEXT:    s_add_u32 s22, s22, s24
+; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX8-NEXT:    s_or_b32 s23, s23, s24
 ; GFX8-NEXT:    s_mul_i32 s16, s16, s15
-; GFX8-NEXT:    s_addc_u32 s15, s26, s16
+; GFX8-NEXT:    s_and_b32 s15, s23, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_addc_u32 s15, s27, s16
 ; GFX8-NEXT:    s_mul_i32 s1, s1, s14
 ; GFX8-NEXT:    s_cmp_lg_u32 s39, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s15, s1
@@ -1478,7 +1506,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_cmp_lg_u32 s35, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s5
 ; GFX8-NEXT:    s_mul_i32 s6, s6, s9
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX8-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX8-NEXT:    s_mul_i32 s0, s0, s8
@@ -1510,15 +1538,15 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX9-NEXT:    s_mul_hi_u32 s23, s16, s9
 ; GFX9-NEXT:    s_add_u32 s17, s22, s17
-; GFX9-NEXT:    s_addc_u32 s18, s23, s18
-; GFX9-NEXT:    s_mul_i32 s23, s1, s8
-; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_addc_u32 s22, s23, s18
+; GFX9-NEXT:    s_mul_i32 s18, s1, s8
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX9-NEXT:    s_mul_hi_u32 s24, s1, s8
-; GFX9-NEXT:    s_add_u32 s17, s23, s17
-; GFX9-NEXT:    s_addc_u32 s18, s24, s18
+; GFX9-NEXT:    s_add_u32 s18, s18, s17
+; GFX9-NEXT:    s_addc_u32 s17, s24, s22
 ; GFX9-NEXT:    s_mul_i32 s24, s16, s12
 ; GFX9-NEXT:    s_mul_i32 s26, s1, s11
-; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX9-NEXT:    s_mul_hi_u32 s25, s16, s12
 ; GFX9-NEXT:    s_mul_hi_u32 s27, s1, s11
 ; GFX9-NEXT:    s_add_u32 s24, s26, s24
@@ -1559,16 +1587,21 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s19, s34, s19
 ; GFX9-NEXT:    s_addc_u32 s24, s35, s24
 ; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX9-NEXT:    s_addc_u32 s19, s22, s19
+; GFX9-NEXT:    s_add_u32 s19, s23, s19
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_add_u32 s19, s19, s22
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_or_b32 s22, s23, s22
 ; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT:    s_addc_u32 s20, s20, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX9-NEXT:    s_addc_u32 s20, s20, s24
+; GFX9-NEXT:    s_add_u32 s20, s20, s24
+; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX9-NEXT:    s_and_b32 s22, s22, 1
+; GFX9-NEXT:    s_add_u32 s20, s20, s22
+; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_or_b32 s21, s21, s22
 ; GFX9-NEXT:    s_mul_i32 s22, s16, s14
 ; GFX9-NEXT:    s_mul_i32 s24, s1, s13
-; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX9-NEXT:    s_mul_hi_u32 s23, s16, s14
 ; GFX9-NEXT:    s_mul_hi_u32 s35, s1, s13
 ; GFX9-NEXT:    s_add_u32 s22, s24, s22
@@ -1629,18 +1662,27 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_addc_u32 s30, s30, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s34, 0
 ; GFX9-NEXT:    s_addc_u32 s30, s30, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX9-NEXT:    s_addc_u32 s21, s30, s24
+; GFX9-NEXT:    s_add_u32 s24, s30, s24
+; GFX9-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX9-NEXT:    s_and_b32 s21, s21, 1
+; GFX9-NEXT:    s_add_u32 s21, s24, s21
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_or_b32 s24, s30, s24
 ; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
 ; GFX9-NEXT:    s_addc_u32 s26, s26, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX9-NEXT:    s_addc_u32 s26, s26, 0
 ; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
 ; GFX9-NEXT:    s_addc_u32 s26, s26, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX9-NEXT:    s_addc_u32 s22, s26, s22
+; GFX9-NEXT:    s_add_u32 s22, s26, s22
+; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX9-NEXT:    s_and_b32 s24, s24, 1
+; GFX9-NEXT:    s_add_u32 s22, s22, s24
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_or_b32 s24, s26, s24
 ; GFX9-NEXT:    s_mul_i32 s16, s16, s15
+; GFX9-NEXT:    s_and_b32 s15, s24, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX9-NEXT:    s_addc_u32 s15, s23, s16
 ; GFX9-NEXT:    s_mul_i32 s1, s1, s14
 ; GFX9-NEXT:    s_cmp_lg_u32 s39, 0
@@ -1663,192 +1705,399 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s8
 ; GFX9-NEXT:    s_add_u32 s7, s7, s1
-; GFX9-NEXT:    s_mov_b32 s1, s17
-; GFX9-NEXT:    s_mov_b32 s2, s18
+; GFX9-NEXT:    s_mov_b32 s1, s18
+; GFX9-NEXT:    s_mov_b32 s2, s17
 ; GFX9-NEXT:    s_mov_b32 s3, s19
 ; GFX9-NEXT:    s_mov_b32 s4, s20
 ; GFX9-NEXT:    s_mov_b32 s5, s21
 ; GFX9-NEXT:    s_mov_b32 s6, s22
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: s_mul_i256:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_mul_i32 s17, s0, s10
-; GFX10PLUS-NEXT:    s_mul_i32 s19, s1, s9
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s18, s0, s10
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s20, s1, s9
-; GFX10PLUS-NEXT:    s_add_u32 s17, s19, s17
-; GFX10PLUS-NEXT:    s_addc_u32 s18, s20, s18
-; GFX10PLUS-NEXT:    s_mul_i32 s20, s2, s8
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s21, s2, s8
-; GFX10PLUS-NEXT:    s_cselect_b32 s19, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s17, s20, s17
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s16, s0, s8
-; GFX10PLUS-NEXT:    s_addc_u32 s18, s21, s18
-; GFX10PLUS-NEXT:    s_mul_i32 s21, s0, s9
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s9
-; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s16, s21, s16
-; GFX10PLUS-NEXT:    s_addc_u32 s17, s22, s17
-; GFX10PLUS-NEXT:    s_mul_i32 s22, s1, s8
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s23, s1, s8
-; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s16, s22, s16
-; GFX10PLUS-NEXT:    s_addc_u32 s17, s23, s17
-; GFX10PLUS-NEXT:    s_mul_i32 s23, s0, s12
-; GFX10PLUS-NEXT:    s_mul_i32 s25, s1, s11
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s24, s0, s12
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s26, s1, s11
-; GFX10PLUS-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s23, s25, s23
-; GFX10PLUS-NEXT:    s_addc_u32 s24, s26, s24
-; GFX10PLUS-NEXT:    s_mul_i32 s26, s2, s10
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s27, s2, s10
-; GFX10PLUS-NEXT:    s_cselect_b32 s25, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s23, s26, s23
-; GFX10PLUS-NEXT:    s_addc_u32 s24, s27, s24
-; GFX10PLUS-NEXT:    s_mul_i32 s27, s3, s9
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s28, s3, s9
-; GFX10PLUS-NEXT:    s_cselect_b32 s26, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s23, s27, s23
-; GFX10PLUS-NEXT:    s_addc_u32 s24, s28, s24
-; GFX10PLUS-NEXT:    s_mul_i32 s28, s4, s8
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s29, s4, s8
-; GFX10PLUS-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s23, s28, s23
-; GFX10PLUS-NEXT:    s_addc_u32 s24, s29, s24
-; GFX10PLUS-NEXT:    s_mul_i32 s29, s0, s11
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s30, s0, s11
-; GFX10PLUS-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s18, s29, s18
-; GFX10PLUS-NEXT:    s_addc_u32 s23, s30, s23
-; GFX10PLUS-NEXT:    s_mul_i32 s30, s1, s10
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s31, s1, s10
-; GFX10PLUS-NEXT:    s_cselect_b32 s29, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s18, s30, s18
-; GFX10PLUS-NEXT:    s_addc_u32 s23, s31, s23
-; GFX10PLUS-NEXT:    s_mul_i32 s31, s2, s9
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s33, s2, s9
-; GFX10PLUS-NEXT:    s_cselect_b32 s30, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s18, s31, s18
-; GFX10PLUS-NEXT:    s_addc_u32 s23, s33, s23
-; GFX10PLUS-NEXT:    s_mul_i32 s33, s3, s8
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s3, s8
-; GFX10PLUS-NEXT:    s_cselect_b32 s31, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s18, s33, s18
-; GFX10PLUS-NEXT:    s_addc_u32 s23, s34, s23
-; GFX10PLUS-NEXT:    s_cselect_b32 s33, 1, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s14
-; GFX10PLUS-NEXT:    s_addc_u32 s18, s21, s18
-; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s1, s13
-; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s21, s0, s14
-; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, s23
-; GFX10PLUS-NEXT:    s_mul_i32 s23, s1, s13
-; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s23, s2, s12
-; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s2, s12
-; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s23, s3, s11
-; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s3, s11
-; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s23, s4, s10
-; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s4, s10
-; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s23, s5, s9
-; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s5, s9
-; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s23, s6, s8
-; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s6, s8
-; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s23, s0, s13
-; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s0, s13
-; GFX10PLUS-NEXT:    s_add_u32 s23, s23, s24
-; GFX10PLUS-NEXT:    s_addc_u32 s21, s34, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s34, s1, s12
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s35, s1, s12
-; GFX10PLUS-NEXT:    s_cselect_b32 s24, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s23, s34, s23
-; GFX10PLUS-NEXT:    s_addc_u32 s21, s35, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s35, s2, s11
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s36, s2, s11
-; GFX10PLUS-NEXT:    s_cselect_b32 s34, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s23, s35, s23
-; GFX10PLUS-NEXT:    s_addc_u32 s21, s36, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s36, s3, s10
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s37, s3, s10
-; GFX10PLUS-NEXT:    s_cselect_b32 s35, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s23, s36, s23
-; GFX10PLUS-NEXT:    s_addc_u32 s21, s37, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s37, s4, s9
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s38, s4, s9
-; GFX10PLUS-NEXT:    s_cselect_b32 s36, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s23, s37, s23
-; GFX10PLUS-NEXT:    s_addc_u32 s21, s38, s21
-; GFX10PLUS-NEXT:    s_mul_i32 s38, s5, s8
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s39, s5, s8
-; GFX10PLUS-NEXT:    s_cselect_b32 s37, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s23, s38, s23
-; GFX10PLUS-NEXT:    s_addc_u32 s21, s39, s21
-; GFX10PLUS-NEXT:    s_cselect_b32 s38, 1, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s30, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s14
-; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s31, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s13
-; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s33, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s3, s3, s12
-; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s4, s4, s11
-; GFX10PLUS-NEXT:    s_addc_u32 s20, s29, s23
-; GFX10PLUS-NEXT:    s_cselect_b32 s23, 1, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s26, s0, s15
-; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s5, s5, s10
-; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s6, s6, s9
-; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s7, s7, s8
-; GFX10PLUS-NEXT:    s_addc_u32 s15, s25, s21
-; GFX10PLUS-NEXT:    s_addc_u32 s21, s22, s26
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s38, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s8
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s21, s1
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s37, 0
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s2
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s36, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s2, s17
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s3
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s35, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s3, s18
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s4
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s34, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s4, s19
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s5
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s5, s20
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s6
-; GFX10PLUS-NEXT:    s_mov_b32 s6, s15
-; GFX10PLUS-NEXT:    s_add_i32 s7, s1, s7
-; GFX10PLUS-NEXT:    s_mov_b32 s1, s16
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: s_mul_i256:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_mul_i32 s17, s0, s10
+; GFX10-NEXT:    s_mul_i32 s19, s1, s9
+; GFX10-NEXT:    s_mul_hi_u32 s18, s0, s10
+; GFX10-NEXT:    s_mul_hi_u32 s20, s1, s9
+; GFX10-NEXT:    s_add_u32 s17, s19, s17
+; GFX10-NEXT:    s_addc_u32 s18, s20, s18
+; GFX10-NEXT:    s_mul_i32 s20, s2, s8
+; GFX10-NEXT:    s_mul_hi_u32 s21, s2, s8
+; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX10-NEXT:    s_add_u32 s17, s20, s17
+; GFX10-NEXT:    s_mul_hi_u32 s16, s0, s8
+; GFX10-NEXT:    s_addc_u32 s18, s21, s18
+; GFX10-NEXT:    s_mul_i32 s21, s0, s9
+; GFX10-NEXT:    s_mul_hi_u32 s22, s0, s9
+; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_add_u32 s16, s21, s16
+; GFX10-NEXT:    s_addc_u32 s21, s22, s17
+; GFX10-NEXT:    s_mul_i32 s17, s1, s8
+; GFX10-NEXT:    s_mul_hi_u32 s23, s1, s8
+; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX10-NEXT:    s_add_u32 s17, s17, s16
+; GFX10-NEXT:    s_addc_u32 s16, s23, s21
+; GFX10-NEXT:    s_mul_i32 s23, s0, s12
+; GFX10-NEXT:    s_mul_i32 s25, s1, s11
+; GFX10-NEXT:    s_mul_hi_u32 s24, s0, s12
+; GFX10-NEXT:    s_mul_hi_u32 s26, s1, s11
+; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX10-NEXT:    s_add_u32 s23, s25, s23
+; GFX10-NEXT:    s_addc_u32 s24, s26, s24
+; GFX10-NEXT:    s_mul_i32 s26, s2, s10
+; GFX10-NEXT:    s_mul_hi_u32 s27, s2, s10
+; GFX10-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX10-NEXT:    s_add_u32 s23, s26, s23
+; GFX10-NEXT:    s_addc_u32 s24, s27, s24
+; GFX10-NEXT:    s_mul_i32 s27, s3, s9
+; GFX10-NEXT:    s_mul_hi_u32 s28, s3, s9
+; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX10-NEXT:    s_add_u32 s23, s27, s23
+; GFX10-NEXT:    s_addc_u32 s24, s28, s24
+; GFX10-NEXT:    s_mul_i32 s28, s4, s8
+; GFX10-NEXT:    s_mul_hi_u32 s29, s4, s8
+; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX10-NEXT:    s_add_u32 s23, s28, s23
+; GFX10-NEXT:    s_addc_u32 s24, s29, s24
+; GFX10-NEXT:    s_mul_i32 s29, s0, s11
+; GFX10-NEXT:    s_mul_hi_u32 s30, s0, s11
+; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX10-NEXT:    s_add_u32 s18, s29, s18
+; GFX10-NEXT:    s_addc_u32 s23, s30, s23
+; GFX10-NEXT:    s_mul_i32 s30, s1, s10
+; GFX10-NEXT:    s_mul_hi_u32 s31, s1, s10
+; GFX10-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX10-NEXT:    s_add_u32 s18, s30, s18
+; GFX10-NEXT:    s_addc_u32 s23, s31, s23
+; GFX10-NEXT:    s_mul_i32 s31, s2, s9
+; GFX10-NEXT:    s_mul_hi_u32 s33, s2, s9
+; GFX10-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX10-NEXT:    s_add_u32 s18, s31, s18
+; GFX10-NEXT:    s_addc_u32 s23, s33, s23
+; GFX10-NEXT:    s_mul_i32 s33, s3, s8
+; GFX10-NEXT:    s_mul_hi_u32 s34, s3, s8
+; GFX10-NEXT:    s_cselect_b32 s31, 1, 0
+; GFX10-NEXT:    s_add_u32 s18, s33, s18
+; GFX10-NEXT:    s_addc_u32 s23, s34, s23
+; GFX10-NEXT:    s_cselect_b32 s33, 1, 0
+; GFX10-NEXT:    s_add_u32 s18, s22, s18
+; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX10-NEXT:    s_add_u32 s18, s18, s21
+; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX10-NEXT:    s_mul_hi_u32 s34, s1, s13
+; GFX10-NEXT:    s_or_b32 s21, s22, s21
+; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_mul_hi_u32 s22, s0, s14
+; GFX10-NEXT:    s_addc_u32 s19, s19, 0
+; GFX10-NEXT:    s_mul_hi_u32 s35, s1, s12
+; GFX10-NEXT:    s_add_u32 s19, s19, s23
+; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_and_b32 s21, s21, 1
+; GFX10-NEXT:    s_mul_i32 s23, s1, s13
+; GFX10-NEXT:    s_add_u32 s19, s19, s21
+; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX10-NEXT:    s_mul_hi_u32 s36, s2, s11
+; GFX10-NEXT:    s_or_b32 s20, s20, s21
+; GFX10-NEXT:    s_mul_i32 s21, s0, s14
+; GFX10-NEXT:    s_mul_hi_u32 s37, s3, s10
+; GFX10-NEXT:    s_add_u32 s21, s23, s21
+; GFX10-NEXT:    s_mul_i32 s23, s2, s12
+; GFX10-NEXT:    s_addc_u32 s22, s34, s22
+; GFX10-NEXT:    s_mul_hi_u32 s34, s2, s12
+; GFX10-NEXT:    s_add_u32 s21, s23, s21
+; GFX10-NEXT:    s_mul_i32 s23, s3, s11
+; GFX10-NEXT:    s_addc_u32 s22, s34, s22
+; GFX10-NEXT:    s_mul_hi_u32 s34, s3, s11
+; GFX10-NEXT:    s_add_u32 s21, s23, s21
+; GFX10-NEXT:    s_mul_i32 s23, s4, s10
+; GFX10-NEXT:    s_addc_u32 s22, s34, s22
+; GFX10-NEXT:    s_mul_hi_u32 s34, s4, s10
+; GFX10-NEXT:    s_add_u32 s21, s23, s21
+; GFX10-NEXT:    s_mul_i32 s23, s5, s9
+; GFX10-NEXT:    s_addc_u32 s22, s34, s22
+; GFX10-NEXT:    s_mul_hi_u32 s34, s5, s9
+; GFX10-NEXT:    s_add_u32 s21, s23, s21
+; GFX10-NEXT:    s_mul_i32 s23, s6, s8
+; GFX10-NEXT:    s_addc_u32 s22, s34, s22
+; GFX10-NEXT:    s_mul_hi_u32 s34, s6, s8
+; GFX10-NEXT:    s_add_u32 s21, s23, s21
+; GFX10-NEXT:    s_mul_i32 s23, s0, s13
+; GFX10-NEXT:    s_addc_u32 s22, s34, s22
+; GFX10-NEXT:    s_mul_hi_u32 s34, s0, s13
+; GFX10-NEXT:    s_add_u32 s23, s23, s24
+; GFX10-NEXT:    s_addc_u32 s21, s34, s21
+; GFX10-NEXT:    s_mul_i32 s34, s1, s12
+; GFX10-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX10-NEXT:    s_add_u32 s23, s34, s23
+; GFX10-NEXT:    s_addc_u32 s21, s35, s21
+; GFX10-NEXT:    s_mul_i32 s35, s2, s11
+; GFX10-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX10-NEXT:    s_add_u32 s23, s35, s23
+; GFX10-NEXT:    s_addc_u32 s21, s36, s21
+; GFX10-NEXT:    s_mul_i32 s36, s3, s10
+; GFX10-NEXT:    s_cselect_b32 s35, 1, 0
+; GFX10-NEXT:    s_add_u32 s23, s36, s23
+; GFX10-NEXT:    s_addc_u32 s21, s37, s21
+; GFX10-NEXT:    s_mul_i32 s37, s4, s9
+; GFX10-NEXT:    s_mul_hi_u32 s38, s4, s9
+; GFX10-NEXT:    s_cselect_b32 s36, 1, 0
+; GFX10-NEXT:    s_add_u32 s23, s37, s23
+; GFX10-NEXT:    s_addc_u32 s21, s38, s21
+; GFX10-NEXT:    s_mul_i32 s38, s5, s8
+; GFX10-NEXT:    s_mul_hi_u32 s39, s5, s8
+; GFX10-NEXT:    s_cselect_b32 s37, 1, 0
+; GFX10-NEXT:    s_add_u32 s23, s38, s23
+; GFX10-NEXT:    s_addc_u32 s21, s39, s21
+; GFX10-NEXT:    s_cselect_b32 s38, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX10-NEXT:    s_mul_i32 s15, s0, s15
+; GFX10-NEXT:    s_addc_u32 s29, s29, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s31, 0
+; GFX10-NEXT:    s_mul_i32 s1, s1, s14
+; GFX10-NEXT:    s_addc_u32 s29, s29, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s33, 0
+; GFX10-NEXT:    s_mul_i32 s2, s2, s13
+; GFX10-NEXT:    s_addc_u32 s29, s29, 0
+; GFX10-NEXT:    s_mul_i32 s3, s3, s12
+; GFX10-NEXT:    s_add_u32 s23, s29, s23
+; GFX10-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX10-NEXT:    s_and_b32 s20, s20, 1
+; GFX10-NEXT:    s_mul_i32 s4, s4, s11
+; GFX10-NEXT:    s_add_u32 s20, s23, s20
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_mul_i32 s5, s5, s10
+; GFX10-NEXT:    s_or_b32 s23, s29, s23
+; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX10-NEXT:    s_mul_i32 s6, s6, s9
+; GFX10-NEXT:    s_addc_u32 s25, s25, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_mul_i32 s7, s7, s8
+; GFX10-NEXT:    s_addc_u32 s25, s25, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX10-NEXT:    s_mul_i32 s0, s0, s8
+; GFX10-NEXT:    s_addc_u32 s25, s25, 0
+; GFX10-NEXT:    s_add_u32 s21, s25, s21
+; GFX10-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX10-NEXT:    s_and_b32 s23, s23, 1
+; GFX10-NEXT:    s_add_u32 s21, s21, s23
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_or_b32 s23, s25, s23
+; GFX10-NEXT:    s_and_b32 s23, s23, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX10-NEXT:    s_addc_u32 s15, s22, s15
+; GFX10-NEXT:    s_cmp_lg_u32 s38, 0
+; GFX10-NEXT:    s_addc_u32 s1, s15, s1
+; GFX10-NEXT:    s_cmp_lg_u32 s37, 0
+; GFX10-NEXT:    s_addc_u32 s1, s1, s2
+; GFX10-NEXT:    s_cmp_lg_u32 s36, 0
+; GFX10-NEXT:    s_mov_b32 s2, s16
+; GFX10-NEXT:    s_addc_u32 s1, s1, s3
+; GFX10-NEXT:    s_cmp_lg_u32 s35, 0
+; GFX10-NEXT:    s_mov_b32 s3, s18
+; GFX10-NEXT:    s_addc_u32 s1, s1, s4
+; GFX10-NEXT:    s_cmp_lg_u32 s34, 0
+; GFX10-NEXT:    s_mov_b32 s4, s19
+; GFX10-NEXT:    s_addc_u32 s1, s1, s5
+; GFX10-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX10-NEXT:    s_mov_b32 s5, s20
+; GFX10-NEXT:    s_addc_u32 s1, s1, s6
+; GFX10-NEXT:    s_mov_b32 s6, s21
+; GFX10-NEXT:    s_add_i32 s7, s1, s7
+; GFX10-NEXT:    s_mov_b32 s1, s17
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_mul_i256:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mul_i32 s17, s0, s10
+; GFX11-NEXT:    s_mul_i32 s19, s1, s9
+; GFX11-NEXT:    s_mul_hi_u32 s18, s0, s10
+; GFX11-NEXT:    s_mul_hi_u32 s20, s1, s9
+; GFX11-NEXT:    s_add_u32 s17, s19, s17
+; GFX11-NEXT:    s_addc_u32 s18, s20, s18
+; GFX11-NEXT:    s_mul_i32 s20, s2, s8
+; GFX11-NEXT:    s_mul_hi_u32 s21, s2, s8
+; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX11-NEXT:    s_add_u32 s17, s20, s17
+; GFX11-NEXT:    s_mul_hi_u32 s16, s0, s8
+; GFX11-NEXT:    s_addc_u32 s18, s21, s18
+; GFX11-NEXT:    s_mul_i32 s21, s0, s9
+; GFX11-NEXT:    s_mul_hi_u32 s22, s0, s9
+; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX11-NEXT:    s_add_u32 s16, s21, s16
+; GFX11-NEXT:    s_addc_u32 s17, s22, s17
+; GFX11-NEXT:    s_mul_i32 s22, s1, s8
+; GFX11-NEXT:    s_mul_hi_u32 s23, s1, s8
+; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX11-NEXT:    s_add_u32 s16, s22, s16
+; GFX11-NEXT:    s_addc_u32 s17, s23, s17
+; GFX11-NEXT:    s_mul_i32 s23, s0, s12
+; GFX11-NEXT:    s_mul_i32 s25, s1, s11
+; GFX11-NEXT:    s_mul_hi_u32 s24, s0, s12
+; GFX11-NEXT:    s_mul_hi_u32 s26, s1, s11
+; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX11-NEXT:    s_add_u32 s23, s25, s23
+; GFX11-NEXT:    s_addc_u32 s24, s26, s24
+; GFX11-NEXT:    s_mul_i32 s26, s2, s10
+; GFX11-NEXT:    s_mul_hi_u32 s27, s2, s10
+; GFX11-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX11-NEXT:    s_add_u32 s23, s26, s23
+; GFX11-NEXT:    s_addc_u32 s24, s27, s24
+; GFX11-NEXT:    s_mul_i32 s27, s3, s9
+; GFX11-NEXT:    s_mul_hi_u32 s28, s3, s9
+; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX11-NEXT:    s_add_u32 s23, s27, s23
+; GFX11-NEXT:    s_addc_u32 s24, s28, s24
+; GFX11-NEXT:    s_mul_i32 s28, s4, s8
+; GFX11-NEXT:    s_mul_hi_u32 s29, s4, s8
+; GFX11-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX11-NEXT:    s_add_u32 s23, s28, s23
+; GFX11-NEXT:    s_addc_u32 s24, s29, s24
+; GFX11-NEXT:    s_mul_i32 s29, s0, s11
+; GFX11-NEXT:    s_mul_hi_u32 s30, s0, s11
+; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX11-NEXT:    s_add_u32 s18, s29, s18
+; GFX11-NEXT:    s_addc_u32 s23, s30, s23
+; GFX11-NEXT:    s_mul_i32 s30, s1, s10
+; GFX11-NEXT:    s_mul_hi_u32 s31, s1, s10
+; GFX11-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX11-NEXT:    s_add_u32 s18, s30, s18
+; GFX11-NEXT:    s_addc_u32 s23, s31, s23
+; GFX11-NEXT:    s_mul_i32 s31, s2, s9
+; GFX11-NEXT:    s_mul_hi_u32 s33, s2, s9
+; GFX11-NEXT:    s_cselect_b32 s30, 1, 0
+; GFX11-NEXT:    s_add_u32 s18, s31, s18
+; GFX11-NEXT:    s_addc_u32 s23, s33, s23
+; GFX11-NEXT:    s_mul_i32 s33, s3, s8
+; GFX11-NEXT:    s_mul_hi_u32 s34, s3, s8
+; GFX11-NEXT:    s_cselect_b32 s31, 1, 0
+; GFX11-NEXT:    s_add_u32 s18, s33, s18
+; GFX11-NEXT:    s_addc_u32 s23, s34, s23
+; GFX11-NEXT:    s_cselect_b32 s33, 1, 0
+; GFX11-NEXT:    s_add_u32 s18, s21, s18
+; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX11-NEXT:    s_add_u32 s18, s18, s22
+; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX11-NEXT:    s_mul_hi_u32 s34, s1, s13
+; GFX11-NEXT:    s_or_b32 s21, s21, s22
+; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX11-NEXT:    s_mul_hi_u32 s22, s0, s14
+; GFX11-NEXT:    s_addc_u32 s19, s19, 0
+; GFX11-NEXT:    s_mul_hi_u32 s35, s1, s12
+; GFX11-NEXT:    s_add_u32 s19, s19, s23
+; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX11-NEXT:    s_and_b32 s21, s21, 1
+; GFX11-NEXT:    s_mul_i32 s23, s1, s13
+; GFX11-NEXT:    s_add_u32 s19, s19, s21
+; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX11-NEXT:    s_mul_hi_u32 s36, s2, s11
+; GFX11-NEXT:    s_or_b32 s20, s20, s21
+; GFX11-NEXT:    s_mul_i32 s21, s0, s14
+; GFX11-NEXT:    s_mul_hi_u32 s37, s3, s10
+; GFX11-NEXT:    s_add_u32 s21, s23, s21
+; GFX11-NEXT:    s_mul_i32 s23, s2, s12
+; GFX11-NEXT:    s_addc_u32 s22, s34, s22
+; GFX11-NEXT:    s_mul_hi_u32 s34, s2, s12
+; GFX11-NEXT:    s_add_u32 s21, s23, s21
+; GFX11-NEXT:    s_mul_i32 s23, s3, s11
+; GFX11-NEXT:    s_addc_u32 s22, s34, s22
+; GFX11-NEXT:    s_mul_hi_u32 s34, s3, s11
+; GFX11-NEXT:    s_add_u32 s21, s23, s21
+; GFX11-NEXT:    s_mul_i32 s23, s4, s10
+; GFX11-NEXT:    s_addc_u32 s22, s34, s22
+; GFX11-NEXT:    s_mul_hi_u32 s34, s4, s10
+; GFX11-NEXT:    s_add_u32 s21, s23, s21
+; GFX11-NEXT:    s_mul_i32 s23, s5, s9
+; GFX11-NEXT:    s_addc_u32 s22, s34, s22
+; GFX11-NEXT:    s_mul_hi_u32 s34, s5, s9
+; GFX11-NEXT:    s_add_u32 s21, s23, s21
+; GFX11-NEXT:    s_mul_i32 s23, s6, s8
+; GFX11-NEXT:    s_addc_u32 s22, s34, s22
+; GFX11-NEXT:    s_mul_hi_u32 s34, s6, s8
+; GFX11-NEXT:    s_add_u32 s21, s23, s21
+; GFX11-NEXT:    s_mul_i32 s23, s0, s13
+; GFX11-NEXT:    s_addc_u32 s22, s34, s22
+; GFX11-NEXT:    s_mul_hi_u32 s34, s0, s13
+; GFX11-NEXT:    s_add_u32 s23, s23, s24
+; GFX11-NEXT:    s_addc_u32 s21, s34, s21
+; GFX11-NEXT:    s_mul_i32 s34, s1, s12
+; GFX11-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX11-NEXT:    s_add_u32 s23, s34, s23
+; GFX11-NEXT:    s_addc_u32 s21, s35, s21
+; GFX11-NEXT:    s_mul_i32 s35, s2, s11
+; GFX11-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX11-NEXT:    s_add_u32 s23, s35, s23
+; GFX11-NEXT:    s_addc_u32 s21, s36, s21
+; GFX11-NEXT:    s_mul_i32 s36, s3, s10
+; GFX11-NEXT:    s_cselect_b32 s35, 1, 0
+; GFX11-NEXT:    s_add_u32 s23, s36, s23
+; GFX11-NEXT:    s_addc_u32 s21, s37, s21
+; GFX11-NEXT:    s_mul_i32 s37, s4, s9
+; GFX11-NEXT:    s_mul_hi_u32 s38, s4, s9
+; GFX11-NEXT:    s_cselect_b32 s36, 1, 0
+; GFX11-NEXT:    s_add_u32 s23, s37, s23
+; GFX11-NEXT:    s_addc_u32 s21, s38, s21
+; GFX11-NEXT:    s_mul_i32 s38, s5, s8
+; GFX11-NEXT:    s_mul_hi_u32 s39, s5, s8
+; GFX11-NEXT:    s_cselect_b32 s37, 1, 0
+; GFX11-NEXT:    s_add_u32 s23, s38, s23
+; GFX11-NEXT:    s_addc_u32 s21, s39, s21
+; GFX11-NEXT:    s_cselect_b32 s38, 1, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX11-NEXT:    s_mul_i32 s15, s0, s15
+; GFX11-NEXT:    s_addc_u32 s29, s29, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s31, 0
+; GFX11-NEXT:    s_mul_i32 s1, s1, s14
+; GFX11-NEXT:    s_addc_u32 s29, s29, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s33, 0
+; GFX11-NEXT:    s_mul_i32 s2, s2, s13
+; GFX11-NEXT:    s_addc_u32 s29, s29, 0
+; GFX11-NEXT:    s_mul_i32 s3, s3, s12
+; GFX11-NEXT:    s_add_u32 s23, s29, s23
+; GFX11-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX11-NEXT:    s_and_b32 s20, s20, 1
+; GFX11-NEXT:    s_mul_i32 s4, s4, s11
+; GFX11-NEXT:    s_add_u32 s20, s23, s20
+; GFX11-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX11-NEXT:    s_mul_i32 s5, s5, s10
+; GFX11-NEXT:    s_or_b32 s23, s29, s23
+; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX11-NEXT:    s_mul_i32 s6, s6, s9
+; GFX11-NEXT:    s_addc_u32 s25, s25, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX11-NEXT:    s_mul_i32 s7, s7, s8
+; GFX11-NEXT:    s_addc_u32 s25, s25, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX11-NEXT:    s_mul_i32 s0, s0, s8
+; GFX11-NEXT:    s_addc_u32 s25, s25, 0
+; GFX11-NEXT:    s_add_u32 s21, s25, s21
+; GFX11-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX11-NEXT:    s_and_b32 s23, s23, 1
+; GFX11-NEXT:    s_add_u32 s21, s21, s23
+; GFX11-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX11-NEXT:    s_or_b32 s23, s25, s23
+; GFX11-NEXT:    s_and_b32 s23, s23, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX11-NEXT:    s_addc_u32 s15, s22, s15
+; GFX11-NEXT:    s_cmp_lg_u32 s38, 0
+; GFX11-NEXT:    s_addc_u32 s1, s15, s1
+; GFX11-NEXT:    s_cmp_lg_u32 s37, 0
+; GFX11-NEXT:    s_addc_u32 s1, s1, s2
+; GFX11-NEXT:    s_cmp_lg_u32 s36, 0
+; GFX11-NEXT:    s_mov_b32 s2, s17
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_cmp_lg_u32 s35, 0
+; GFX11-NEXT:    s_mov_b32 s3, s18
+; GFX11-NEXT:    s_addc_u32 s1, s1, s4
+; GFX11-NEXT:    s_cmp_lg_u32 s34, 0
+; GFX11-NEXT:    s_mov_b32 s4, s19
+; GFX11-NEXT:    s_addc_u32 s1, s1, s5
+; GFX11-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX11-NEXT:    s_mov_b32 s5, s20
+; GFX11-NEXT:    s_addc_u32 s1, s1, s6
+; GFX11-NEXT:    s_mov_b32 s6, s21
+; GFX11-NEXT:    s_add_i32 s7, s1, s7
+; GFX11-NEXT:    s_mov_b32 s1, s16
+; GFX11-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_mul_i256:
 ; GFX12:       ; %bb.0:
@@ -1917,18 +2166,26 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX12-NEXT:    s_add_co_u32 s18, s33, s18
 ; GFX12-NEXT:    s_add_co_ci_u32 s23, s34, s23
 ; GFX12-NEXT:    s_cselect_b32 s33, 1, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX12-NEXT:    s_mul_hi_u32 s22, s0, s14
-; GFX12-NEXT:    s_add_co_ci_u32 s18, s21, s18
+; GFX12-NEXT:    s_add_co_u32 s18, s21, s18
 ; GFX12-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX12-NEXT:    s_add_co_u32 s18, s18, s22
+; GFX12-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX12-NEXT:    s_mul_hi_u32 s34, s1, s13
+; GFX12-NEXT:    s_or_b32 s21, s21, s22
+; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX12-NEXT:    s_mul_hi_u32 s22, s0, s14
 ; GFX12-NEXT:    s_add_co_ci_u32 s19, s19, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX12-NEXT:    s_mul_i32 s21, s0, s14
-; GFX12-NEXT:    s_add_co_ci_u32 s19, s19, s23
-; GFX12-NEXT:    s_mul_i32 s23, s1, s13
+; GFX12-NEXT:    s_mul_hi_u32 s35, s1, s12
+; GFX12-NEXT:    s_add_co_u32 s19, s19, s23
 ; GFX12-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX12-NEXT:    s_and_b32 s21, s21, 1
+; GFX12-NEXT:    s_mul_i32 s23, s1, s13
+; GFX12-NEXT:    s_add_co_u32 s19, s19, s21
+; GFX12-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX12-NEXT:    s_mul_hi_u32 s36, s2, s11
+; GFX12-NEXT:    s_or_b32 s20, s20, s21
+; GFX12-NEXT:    s_mul_i32 s21, s0, s14
+; GFX12-NEXT:    s_mul_hi_u32 s37, s3, s10
 ; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
 ; GFX12-NEXT:    s_mul_i32 s23, s2, s12
 ; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
@@ -1956,17 +2213,14 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX12-NEXT:    s_add_co_u32 s23, s23, s24
 ; GFX12-NEXT:    s_add_co_ci_u32 s21, s34, s21
 ; GFX12-NEXT:    s_mul_i32 s34, s1, s12
-; GFX12-NEXT:    s_mul_hi_u32 s35, s1, s12
 ; GFX12-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX12-NEXT:    s_add_co_u32 s23, s34, s23
 ; GFX12-NEXT:    s_add_co_ci_u32 s21, s35, s21
 ; GFX12-NEXT:    s_mul_i32 s35, s2, s11
-; GFX12-NEXT:    s_mul_hi_u32 s36, s2, s11
 ; GFX12-NEXT:    s_cselect_b32 s34, 1, 0
 ; GFX12-NEXT:    s_add_co_u32 s23, s35, s23
 ; GFX12-NEXT:    s_add_co_ci_u32 s21, s36, s21
 ; GFX12-NEXT:    s_mul_i32 s36, s3, s10
-; GFX12-NEXT:    s_mul_hi_u32 s37, s3, s10
 ; GFX12-NEXT:    s_cselect_b32 s35, 1, 0
 ; GFX12-NEXT:    s_add_co_u32 s23, s36, s23
 ; GFX12-NEXT:    s_add_co_ci_u32 s21, s37, s21
@@ -1982,34 +2236,46 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX12-NEXT:    s_add_co_ci_u32 s21, s39, s21
 ; GFX12-NEXT:    s_cselect_b32 s38, 1, 0
 ; GFX12-NEXT:    s_cmp_lg_u32 s30, 0
-; GFX12-NEXT:    s_mul_i32 s1, s1, s14
+; GFX12-NEXT:    s_mul_i32 s15, s0, s15
 ; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
 ; GFX12-NEXT:    s_cmp_lg_u32 s31, 0
-; GFX12-NEXT:    s_mul_i32 s2, s2, s13
+; GFX12-NEXT:    s_mul_i32 s1, s1, s14
 ; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
 ; GFX12-NEXT:    s_cmp_lg_u32 s33, 0
-; GFX12-NEXT:    s_mul_i32 s3, s3, s12
+; GFX12-NEXT:    s_mul_i32 s2, s2, s13
 ; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX12-NEXT:    s_mul_i32 s3, s3, s12
+; GFX12-NEXT:    s_add_co_u32 s23, s29, s23
+; GFX12-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX12-NEXT:    s_and_b32 s20, s20, 1
 ; GFX12-NEXT:    s_mul_i32 s4, s4, s11
-; GFX12-NEXT:    s_add_co_ci_u32 s20, s29, s23
+; GFX12-NEXT:    s_add_co_u32 s20, s23, s20
 ; GFX12-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX12-NEXT:    s_mul_i32 s5, s5, s10
+; GFX12-NEXT:    s_or_b32 s23, s29, s23
 ; GFX12-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX12-NEXT:    s_mul_i32 s26, s0, s15
+; GFX12-NEXT:    s_mul_i32 s6, s6, s9
 ; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
 ; GFX12-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX12-NEXT:    s_mul_i32 s5, s5, s10
+; GFX12-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
 ; GFX12-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX12-NEXT:    s_mul_i32 s6, s6, s9
+; GFX12-NEXT:    s_mul_i32 s0, s0, s8
 ; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_co_u32 s21, s25, s21
+; GFX12-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX12-NEXT:    s_and_b32 s23, s23, 1
+; GFX12-NEXT:    s_add_co_u32 s21, s21, s23
+; GFX12-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_or_b32 s23, s25, s23
+; GFX12-NEXT:    s_and_b32 s23, s23, 1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX12-NEXT:    s_mul_i32 s7, s7, s8
-; GFX12-NEXT:    s_add_co_ci_u32 s15, s25, s21
-; GFX12-NEXT:    s_add_co_ci_u32 s21, s22, s26
+; GFX12-NEXT:    s_add_co_ci_u32 s15, s22, s15
 ; GFX12-NEXT:    s_cmp_lg_u32 s38, 0
-; GFX12-NEXT:    s_mul_i32 s0, s0, s8
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s21, s1
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s15, s1
 ; GFX12-NEXT:    s_cmp_lg_u32 s37, 0
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s2
 ; GFX12-NEXT:    s_cmp_lg_u32 s36, 0
@@ -2024,7 +2290,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX12-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX12-NEXT:    s_mov_b32 s5, s20
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s6
-; GFX12-NEXT:    s_mov_b32 s6, s15
+; GFX12-NEXT:    s_mov_b32 s6, s21
 ; GFX12-NEXT:    s_add_co_i32 s7, s1, s7
 ; GFX12-NEXT:    s_mov_b32 s1, s16
 ; GFX12-NEXT:    ; return to shader part epilog
@@ -2037,208 +2303,244 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-LABEL: v_mul_i256:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v16, v0
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX7-NEXT:    v_mov_b32_e32 v17, v1
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX7-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX7-NEXT:    v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX7-NEXT:    v_addc_u32_e32 v25, vcc, 0, v24, vcc
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX7-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT:    v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX7-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v18, v23
-; GFX7-NEXT:    v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX7-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX7-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v0, v20
-; GFX7-NEXT:    v_mov_b32_e32 v1, v23
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX7-NEXT:    v_mul_lo_u32 v20, v6, v9
-; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX7-NEXT:    v_mul_lo_u32 v23, v5, v10
-; GFX7-NEXT:    v_mul_lo_u32 v26, v4, v11
-; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX7-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
-; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
-; GFX7-NEXT:    v_mul_lo_u32 v13, v2, v13
-; GFX7-NEXT:    v_mov_b32_e32 v2, v22
-; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
-; GFX7-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[20:21], v1, v11, v[16:17]
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[14:15], v1, v9, v[18:19]
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17]
+; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v2, v8, v[18:19]
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v9, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[14:15]
+; GFX7-NEXT:    v_addc_u32_e64 v22, s[6:7], 0, v22, s[6:7]
+; GFX7-NEXT:    v_mad_u64_u32 v[20:21], s[8:9], v4, v8, v[16:17]
+; GFX7-NEXT:    v_mov_b32_e32 v16, v19
+; GFX7-NEXT:    v_mov_b32_e32 v17, v20
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[22:23], v0, v11, v[16:17]
+; GFX7-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s[22:23]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[10:11], v1, v10, v[16:17]
+; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[12:13], v0, v8, 0
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[19:20]
+; GFX7-NEXT:    v_mad_u64_u32 v[17:18], s[14:15], v0, v9, v[17:18]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[6:7], v3, v8, v[19:20]
+; GFX7-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[14:15]
+; GFX7-NEXT:    v_addc_u32_e64 v25, s[10:11], 0, v25, s[10:11]
+; GFX7-NEXT:    v_add_i32_e64 v23, s[14:15], v23, v19
+; GFX7-NEXT:    v_add_i32_e64 v24, s[16:17], v22, v20
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v0, v14, 0
+; GFX7-NEXT:    v_addc_u32_e64 v25, s[12:13], 0, v25, s[12:13]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v1, v13, v[19:20]
+; GFX7-NEXT:    v_addc_u32_e64 v25, s[6:7], 0, v25, s[6:7]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v2, v12, v[19:20]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v3, v11, v[19:20]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v4, v10, v[19:20]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v5, v9, v[19:20]
+; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v6, v8, v[19:20]
+; GFX7-NEXT:    v_mov_b32_e32 v22, v19
+; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[18:19], v0, v13, v[21:22]
+; GFX7-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[20:21]
+; GFX7-NEXT:    v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; GFX7-NEXT:    v_mad_u64_u32 v[21:22], vcc, v1, v12, v[21:22]
+; GFX7-NEXT:    v_addc_u32_e64 v19, s[4:5], 0, v19, s[4:5]
+; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v11, v[21:22]
+; GFX7-NEXT:    v_addc_u32_e64 v19, s[8:9], 0, v19, s[8:9]
+; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[8:9], v3, v10, v[21:22]
 ; GFX7-NEXT:    v_mul_lo_u32 v12, v3, v12
-; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
-; GFX7-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX7-NEXT:    v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
-; GFX7-NEXT:    v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX7-NEXT:    v_mul_lo_u32 v10, v16, v15
-; GFX7-NEXT:    v_mul_lo_u32 v9, v17, v14
-; GFX7-NEXT:    v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX7-NEXT:    v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX7-NEXT:    v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX7-NEXT:    v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX7-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX7-NEXT:    v_mul_lo_u32 v11, v4, v11
+; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[10:11], v4, v9, v[21:22]
+; GFX7-NEXT:    v_mul_lo_u32 v0, v0, v15
+; GFX7-NEXT:    v_mul_lo_u32 v2, v2, v13
+; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[12:13], v5, v8, v[21:22]
+; GFX7-NEXT:    v_add_i32_e64 v21, s[6:7], v25, v21
+; GFX7-NEXT:    v_add_i32_e64 v19, s[20:21], v19, v22
+; GFX7-NEXT:    v_mul_lo_u32 v22, v6, v9
+; GFX7-NEXT:    v_mul_lo_u32 v25, v5, v10
+; GFX7-NEXT:    v_mad_u64_u32 v[9:10], s[22:23], v1, v8, v[17:18]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[22:23]
+; GFX7-NEXT:    v_add_i32_e64 v3, s[22:23], v23, v3
+; GFX7-NEXT:    s_or_b64 s[14:15], s[14:15], s[22:23]
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[14:15]
+; GFX7-NEXT:    v_add_i32_e64 v4, s[14:15], v24, v4
+; GFX7-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[14:15]
+; GFX7-NEXT:    v_add_i32_e64 v5, s[14:15], v21, v5
+; GFX7-NEXT:    s_or_b64 s[6:7], s[6:7], s[14:15]
+; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v14
+; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GFX7-NEXT:    v_add_i32_e64 v6, s[6:7], v19, v6
+; GFX7-NEXT:    s_or_b64 s[6:7], s[20:21], s[6:7]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[6:7], v20, v0, s[6:7]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v1, s[12:13]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v2, s[10:11]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v12, s[8:9]
+; GFX7-NEXT:    v_addc_u32_e64 v0, s[4:5], v0, v11, s[4:5]
+; GFX7-NEXT:    v_addc_u32_e32 v0, vcc, v0, v25, vcc
+; GFX7-NEXT:    v_addc_u32_e64 v0, vcc, v0, v22, s[18:19]
+; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v16
+; GFX7-NEXT:    v_mov_b32_e32 v1, v9
+; GFX7-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_i256:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v16, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX8-NEXT:    v_mov_b32_e32 v17, v1
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX8-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX8-NEXT:    v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v24, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v18, v23
-; GFX8-NEXT:    v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX8-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v0, v20
-; GFX8-NEXT:    v_mov_b32_e32 v1, v23
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v20, v6, v9
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX8-NEXT:    v_mul_lo_u32 v23, v5, v10
-; GFX8-NEXT:    v_mul_lo_u32 v26, v4, v11
-; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX8-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
-; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
-; GFX8-NEXT:    v_mul_lo_u32 v13, v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v2, v22
-; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
-; GFX8-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[20:21], v1, v11, v[16:17]
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[14:15], v1, v9, v[18:19]
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17]
+; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v2, v8, v[18:19]
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v9, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[14:15]
+; GFX8-NEXT:    v_addc_u32_e64 v22, s[6:7], 0, v22, s[6:7]
+; GFX8-NEXT:    v_mad_u64_u32 v[20:21], s[8:9], v4, v8, v[16:17]
+; GFX8-NEXT:    v_mov_b32_e32 v16, v19
+; GFX8-NEXT:    v_mov_b32_e32 v17, v20
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[22:23], v0, v11, v[16:17]
+; GFX8-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s[22:23]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[10:11], v1, v10, v[16:17]
+; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[12:13], v0, v8, 0
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[19:20]
+; GFX8-NEXT:    v_mad_u64_u32 v[17:18], s[14:15], v0, v9, v[17:18]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[6:7], v3, v8, v[19:20]
+; GFX8-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[14:15]
+; GFX8-NEXT:    v_addc_u32_e64 v25, s[10:11], 0, v25, s[10:11]
+; GFX8-NEXT:    v_add_u32_e64 v23, s[14:15], v23, v19
+; GFX8-NEXT:    v_add_u32_e64 v24, s[16:17], v22, v20
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v0, v14, 0
+; GFX8-NEXT:    v_addc_u32_e64 v25, s[12:13], 0, v25, s[12:13]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v1, v13, v[19:20]
+; GFX8-NEXT:    v_addc_u32_e64 v25, s[6:7], 0, v25, s[6:7]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v2, v12, v[19:20]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v3, v11, v[19:20]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v4, v10, v[19:20]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v5, v9, v[19:20]
+; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v6, v8, v[19:20]
+; GFX8-NEXT:    v_mov_b32_e32 v22, v19
+; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[18:19], v0, v13, v[21:22]
+; GFX8-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[20:21]
+; GFX8-NEXT:    v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[21:22], vcc, v1, v12, v[21:22]
+; GFX8-NEXT:    v_addc_u32_e64 v19, s[4:5], 0, v19, s[4:5]
+; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v11, v[21:22]
+; GFX8-NEXT:    v_addc_u32_e64 v19, s[8:9], 0, v19, s[8:9]
+; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[8:9], v3, v10, v[21:22]
 ; GFX8-NEXT:    v_mul_lo_u32 v12, v3, v12
-; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX8-NEXT:    v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
-; GFX8-NEXT:    v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX8-NEXT:    v_mul_lo_u32 v10, v16, v15
-; GFX8-NEXT:    v_mul_lo_u32 v9, v17, v14
-; GFX8-NEXT:    v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX8-NEXT:    v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX8-NEXT:    v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX8-NEXT:    v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX8-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX8-NEXT:    v_mul_lo_u32 v11, v4, v11
+; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[10:11], v4, v9, v[21:22]
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v15
+; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v13
+; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[12:13], v5, v8, v[21:22]
+; GFX8-NEXT:    v_add_u32_e64 v21, s[6:7], v25, v21
+; GFX8-NEXT:    v_add_u32_e64 v19, s[20:21], v19, v22
+; GFX8-NEXT:    v_mul_lo_u32 v22, v6, v9
+; GFX8-NEXT:    v_mul_lo_u32 v25, v5, v10
+; GFX8-NEXT:    v_mad_u64_u32 v[9:10], s[22:23], v1, v8, v[17:18]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[22:23]
+; GFX8-NEXT:    v_add_u32_e64 v3, s[22:23], v23, v3
+; GFX8-NEXT:    s_or_b64 s[14:15], s[14:15], s[22:23]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[14:15]
+; GFX8-NEXT:    v_add_u32_e64 v4, s[14:15], v24, v4
+; GFX8-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[14:15]
+; GFX8-NEXT:    v_add_u32_e64 v5, s[14:15], v21, v5
+; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[14:15]
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GFX8-NEXT:    v_add_u32_e64 v6, s[6:7], v19, v6
+; GFX8-NEXT:    s_or_b64 s[6:7], s[20:21], s[6:7]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[6:7], v20, v0, s[6:7]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v1, s[12:13]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v2, s[10:11]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v12, s[8:9]
+; GFX8-NEXT:    v_addc_u32_e64 v0, s[4:5], v0, v11, s[4:5]
+; GFX8-NEXT:    v_addc_u32_e32 v0, vcc, v0, v25, vcc
+; GFX8-NEXT:    v_addc_u32_e64 v0, vcc, v0, v22, s[18:19]
+; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v16
+; GFX8-NEXT:    v_mov_b32_e32 v1, v9
+; GFX8-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_i256:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v16, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX9-NEXT:    v_mov_b32_e32 v17, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX9-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX9-NEXT:    v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX9-NEXT:    v_addc_co_u32_e32 v25, vcc, 0, v24, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v18, v23
-; GFX9-NEXT:    v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX9-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, v20
-; GFX9-NEXT:    v_mov_b32_e32 v1, v23
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v20, v6, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX9-NEXT:    v_mul_lo_u32 v23, v5, v10
-; GFX9-NEXT:    v_mul_lo_u32 v26, v4, v11
-; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
-; GFX9-NEXT:    v_mul_lo_u32 v13, v2, v13
-; GFX9-NEXT:    v_mov_b32_e32 v2, v22
-; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v10, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[20:21], v1, v11, v[16:17]
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[14:15], v1, v9, v[18:19]
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], vcc, v2, v10, v[16:17]
+; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[6:7], v2, v8, v[18:19]
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v9, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[14:15]
+; GFX9-NEXT:    v_addc_co_u32_e64 v22, s[6:7], 0, v22, s[6:7]
+; GFX9-NEXT:    v_mad_u64_u32 v[20:21], s[8:9], v4, v8, v[16:17]
+; GFX9-NEXT:    v_mov_b32_e32 v16, v19
+; GFX9-NEXT:    v_mov_b32_e32 v17, v20
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[22:23], v0, v11, v[16:17]
+; GFX9-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s[22:23]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[10:11], v1, v10, v[16:17]
+; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[12:13], v0, v8, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[12:13], v2, v9, v[19:20]
+; GFX9-NEXT:    v_mad_u64_u32 v[17:18], s[14:15], v0, v9, v[17:18]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[6:7], v3, v8, v[19:20]
+; GFX9-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s[14:15]
+; GFX9-NEXT:    v_addc_co_u32_e64 v25, s[10:11], 0, v25, s[10:11]
+; GFX9-NEXT:    v_add_co_u32_e64 v23, s[14:15], v23, v19
+; GFX9-NEXT:    v_add_co_u32_e64 v24, s[16:17], v22, v20
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v0, v14, 0
+; GFX9-NEXT:    v_addc_co_u32_e64 v25, s[12:13], 0, v25, s[12:13]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v1, v13, v[19:20]
+; GFX9-NEXT:    v_addc_co_u32_e64 v25, s[6:7], 0, v25, s[6:7]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v2, v12, v[19:20]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v3, v11, v[19:20]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v4, v10, v[19:20]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v5, v9, v[19:20]
+; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[18:19], v6, v8, v[19:20]
+; GFX9-NEXT:    v_mov_b32_e32 v22, v19
+; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[18:19], v0, v13, v[21:22]
+; GFX9-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[20:21]
+; GFX9-NEXT:    v_addc_co_u32_e32 v19, vcc, 0, v19, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[21:22], vcc, v1, v12, v[21:22]
+; GFX9-NEXT:    v_addc_co_u32_e64 v19, s[4:5], 0, v19, s[4:5]
+; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v11, v[21:22]
+; GFX9-NEXT:    v_addc_co_u32_e64 v19, s[8:9], 0, v19, s[8:9]
+; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[8:9], v3, v10, v[21:22]
 ; GFX9-NEXT:    v_mul_lo_u32 v12, v3, v12
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-NEXT:    v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX9-NEXT:    v_mul_lo_u32 v10, v16, v15
-; GFX9-NEXT:    v_mul_lo_u32 v9, v17, v14
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v20, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX9-NEXT:    v_mul_lo_u32 v11, v4, v11
+; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[10:11], v4, v9, v[21:22]
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v15
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v13
+; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[12:13], v5, v8, v[21:22]
+; GFX9-NEXT:    v_add_co_u32_e64 v21, s[6:7], v25, v21
+; GFX9-NEXT:    v_add_co_u32_e64 v19, s[20:21], v19, v22
+; GFX9-NEXT:    v_mul_lo_u32 v22, v6, v9
+; GFX9-NEXT:    v_mul_lo_u32 v25, v5, v10
+; GFX9-NEXT:    v_mad_u64_u32 v[9:10], s[22:23], v1, v8, v[17:18]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[22:23]
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[22:23], v23, v3
+; GFX9-NEXT:    s_or_b64 s[14:15], s[14:15], s[22:23]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[14:15]
+; GFX9-NEXT:    v_add_co_u32_e64 v4, s[14:15], v24, v4
+; GFX9-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[14:15]
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[14:15], v21, v5
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[14:15]
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v14
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[6:7], v19, v6
+; GFX9-NEXT:    s_or_b64 s[6:7], s[20:21], s[6:7]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[6:7], v20, v0, s[6:7]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[6:7], v0, v1, s[12:13]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[6:7], v0, v2, s[10:11]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[6:7], v0, v12, s[8:9]
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[4:5], v0, v11, s[4:5]
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v25, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, vcc, v0, v22, s[18:19]
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v16
+; GFX9-NEXT:    v_mov_b32_e32 v1, v9
+; GFX9-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_i256:
@@ -2246,69 +2548,82 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v16, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v17, v1
+; GFX10-NEXT:    v_mov_b32_e32 v18, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v9
-; GFX10-NEXT:    v_mul_lo_u32 v28, v5, v10
-; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX10-NEXT:    v_mul_lo_u32 v28, v4, v11
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v16, v14, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s4, v16, v12, 0
-; GFX10-NEXT:    v_mul_lo_u32 v30, v17, v14
+; GFX10-NEXT:    v_mad_u64_u32 v[19:20], s4, v16, v12, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[21:22], s5, v16, v10, 0
+; GFX10-NEXT:    v_mul_lo_u32 v29, v5, v10
+; GFX10-NEXT:    v_mul_lo_u32 v14, v17, v14
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
-; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
-; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s4
+; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v18, v12, v[0:1]
+; GFX10-NEXT:    v_mad_u64_u32 v[19:20], s4, v17, v11, v[19:20]
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
-; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[20:21], s4, v16, v10, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[19:20], vcc_lo, v18, v10, v[19:20]
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
-; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
-; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
+; GFX10-NEXT:    v_mad_u64_u32 v[21:22], s4, v17, v9, v[21:22]
+; GFX10-NEXT:    v_mad_u64_u32 v[19:20], vcc_lo, v3, v9, v[19:20]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT:    v_mov_b32_e32 v20, v22
-; GFX10-NEXT:    v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
-; GFX10-NEXT:    v_mov_b32_e32 v20, v18
-; GFX10-NEXT:    v_mov_b32_e32 v19, v22
-; GFX10-NEXT:    v_mul_lo_u32 v22, v16, v15
-; GFX10-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
-; GFX10-NEXT:    v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
+; GFX10-NEXT:    v_mad_u64_u32 v[21:22], s4, v18, v8, v[21:22]
+; GFX10-NEXT:    v_mad_u64_u32 v[19:20], vcc_lo, v4, v8, v[19:20]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v25, s4, 0, v25, s4
+; GFX10-NEXT:    v_mad_u64_u32 v[23:24], s5, v6, v8, v[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v0, v22
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v1, v19
+; GFX10-NEXT:    v_mov_b32_e32 v19, v20
+; GFX10-NEXT:    v_mov_b32_e32 v20, v23
+; GFX10-NEXT:    v_mad_u64_u32 v[22:23], s4, v16, v11, v[0:1]
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_mad_u64_u32 v[19:20], vcc_lo, v16, v13, v[19:20]
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, v16, v8, 0
-; GFX10-NEXT:    v_mul_lo_u32 v20, v4, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
-; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
-; GFX10-NEXT:    v_mul_lo_u32 v25, v3, v12
-; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT:    v_mul_lo_u32 v24, v2, v13
-; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
-; GFX10-NEXT:    v_mov_b32_e32 v13, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
-; GFX10-NEXT:    v_mov_b32_e32 v14, v21
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT:    v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
-; GFX10-NEXT:    v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s8
-; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
-; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4
+; GFX10-NEXT:    v_mul_lo_u32 v13, v18, v13
+; GFX10-NEXT:    v_mad_u64_u32 v[22:23], s4, v17, v10, v[22:23]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s4, 0, v2, s4
+; GFX10-NEXT:    v_mad_u64_u32 v[19:20], s5, v17, v12, v[19:20]
+; GFX10-NEXT:    v_mov_b32_e32 v2, v21
+; GFX10-NEXT:    v_mul_lo_u32 v12, v3, v12
+; GFX10-NEXT:    v_mad_u64_u32 v[22:23], s4, v18, v9, v[22:23]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s4, 0, v6, s4
+; GFX10-NEXT:    v_mad_u64_u32 v[19:20], s6, v18, v11, v[19:20]
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s7, v16, v9, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[21:22], s4, v3, v8, v[22:23]
+; GFX10-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s7
+; GFX10-NEXT:    v_mad_u64_u32 v[10:11], s7, v3, v10, v[19:20]
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v19, s4, 0, v6, s4
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v17, v8, v[1:2]
+; GFX10-NEXT:    v_add_co_u32 v3, s8, v23, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
+; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v4, v9, v[10:11]
+; GFX10-NEXT:    v_add_co_u32 v4, s10, v25, v22
+; GFX10-NEXT:    v_add_co_u32 v3, s9, v3, v6
+; GFX10-NEXT:    s_or_b32 s8, s8, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s8
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s8, v5, v8, v[9:10]
+; GFX10-NEXT:    v_mul_lo_u32 v10, v16, v15
+; GFX10-NEXT:    v_add_co_u32 v4, s9, v4, v11
+; GFX10-NEXT:    s_or_b32 s9, s10, s9
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s9
+; GFX10-NEXT:    v_add_co_u32 v5, s9, v19, v5
+; GFX10-NEXT:    v_add_co_u32 v6, s11, v26, v6
+; GFX10-NEXT:    v_add_co_u32 v5, s10, v5, v9
+; GFX10-NEXT:    s_or_b32 s9, s9, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s9
+; GFX10-NEXT:    v_add_co_u32 v6, s9, v6, v9
+; GFX10-NEXT:    s_or_b32 s9, s11, s9
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s9, v24, v10, s9
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s8, v9, v14, s8
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v13, s4
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v12, s7
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v28, s6
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v29, s5
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v9, v27, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v8, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2316,69 +2631,81 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
+; GFX11-NEXT:    v_mov_b32_e32 v18, v2
 ; GFX11-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX11-NEXT:    v_mul_lo_u32 v27, v6, v9
-; GFX11-NEXT:    v_mul_lo_u32 v28, v5, v10
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v14, 0
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], null, v16, v12, 0
-; GFX11-NEXT:    v_mul_lo_u32 v30, v17, v14
+; GFX11-NEXT:    v_mad_u64_u32 v[19:20], null, v16, v12, 0
+; GFX11-NEXT:    v_mad_u64_u32 v[21:22], null, v16, v10, 0
+; GFX11-NEXT:    v_mul_lo_u32 v28, v4, v11
+; GFX11-NEXT:    v_mul_lo_u32 v29, v5, v10
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19]
-; GFX11-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
-; GFX11-NEXT:    v_mad_u64_u32 v[20:21], null, v16, v10, 0
+; GFX11-NEXT:    v_mad_u64_u32 v[19:20], s0, v17, v11, v[19:20]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_mad_u64_u32 v[21:22], s0, v17, v9, v[21:22]
+; GFX11-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s0
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v18, v12, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[19:20], vcc_lo, v18, v10, v[19:20]
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX11-NEXT:    v_mul_lo_u32 v14, v17, v14
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
+; GFX11-NEXT:    v_mad_u64_u32 v[19:20], vcc_lo, v3, v9, v[19:20]
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
+; GFX11-NEXT:    v_mad_u64_u32 v[19:20], vcc_lo, v4, v8, v[19:20]
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[22:23], null, v6, v8, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX11-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s0
-; GFX11-NEXT:    v_mov_b32_e32 v20, v22
-; GFX11-NEXT:    v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20]
-; GFX11-NEXT:    v_mov_b32_e32 v20, v18
-; GFX11-NEXT:    v_mov_b32_e32 v19, v22
-; GFX11-NEXT:    v_mul_lo_u32 v22, v16, v15
-; GFX11-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
-; GFX11-NEXT:    v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20]
+; GFX11-NEXT:    v_mad_u64_u32 v[23:24], null, v6, v8, v[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v1, v19
+; GFX11-NEXT:    v_mad_u64_u32 v[21:22], s0, v18, v8, v[21:22]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v25, s0, 0, v25, s0
+; GFX11-NEXT:    v_dual_mov_b32 v19, v20 :: v_dual_mov_b32 v20, v23
+; GFX11-NEXT:    v_mov_b32_e32 v0, v22
+; GFX11-NEXT:    v_mad_u64_u32 v[19:20], vcc_lo, v16, v13, v[19:20]
+; GFX11-NEXT:    v_mul_lo_u32 v13, v18, v13
+; GFX11-NEXT:    v_mad_u64_u32 v[22:23], s0, v16, v11, v[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v8, 0
-; GFX11-NEXT:    v_mul_lo_u32 v20, v4, v11
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25]
-; GFX11-NEXT:    v_mul_lo_u32 v25, v3, v12
-; GFX11-NEXT:    v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15]
-; GFX11-NEXT:    v_mov_b32_e32 v14, v21
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
-; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19]
-; GFX11-NEXT:    v_mul_lo_u32 v24, v2, v13
-; GFX11-NEXT:    v_mov_b32_e32 v13, v1
-; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12]
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
-; GFX11-NEXT:    v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19]
-; GFX11-NEXT:    v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14]
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
-; GFX11-NEXT:    v_mad_u64_u32 v[3:4], s4, v3, v8, v[1:2]
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
-; GFX11-NEXT:    v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11]
-; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13]
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
+; GFX11-NEXT:    v_mad_u64_u32 v[19:20], s1, v17, v12, v[19:20]
+; GFX11-NEXT:    v_mul_lo_u32 v12, v3, v12
+; GFX11-NEXT:    v_mad_u64_u32 v[22:23], s0, v17, v10, v[22:23]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s0, 0, v2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v21
+; GFX11-NEXT:    v_mad_u64_u32 v[19:20], s2, v18, v11, v[19:20]
+; GFX11-NEXT:    v_mad_u64_u32 v[22:23], s0, v18, v9, v[22:23]
+; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s3, v16, v9, v[1:2]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s0, 0, v6, s0
+; GFX11-NEXT:    v_mad_u64_u32 v[21:22], s0, v3, v8, v[22:23]
+; GFX11-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s3
+; GFX11-NEXT:    v_mad_u64_u32 v[10:11], s3, v3, v10, v[19:20]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v19, s0, 0, v6, s0
+; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s0, v17, v8, v[1:2]
+; GFX11-NEXT:    v_add_co_u32 v3, s4, v23, v21
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX11-NEXT:    v_mad_u64_u32 v[9:10], s0, v4, v9, v[10:11]
+; GFX11-NEXT:    v_add_co_u32 v4, s6, v25, v22
+; GFX11-NEXT:    v_add_co_u32 v3, s5, v3, v6
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s4
+; GFX11-NEXT:    v_mad_u64_u32 v[5:6], s4, v5, v8, v[9:10]
+; GFX11-NEXT:    v_mul_lo_u32 v10, v16, v15
+; GFX11-NEXT:    v_add_co_u32 v4, s5, v4, v11
+; GFX11-NEXT:    s_or_b32 s5, s6, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
+; GFX11-NEXT:    v_add_co_u32 v5, s5, v19, v5
+; GFX11-NEXT:    v_add_co_u32 v6, s7, v26, v6
+; GFX11-NEXT:    v_add_co_u32 v5, s6, v5, v9
+; GFX11-NEXT:    s_or_b32 s5, s5, s6
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
+; GFX11-NEXT:    v_add_co_u32 v6, s5, v6, v9
+; GFX11-NEXT:    s_or_b32 s5, s7, s5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s5, v24, v10, s5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v14, s4
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s0, v9, v13, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s0, v9, v12, s3
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s0, v9, v28, s2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s0, v9, v29, s1
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v9, v27, vcc_lo
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, v8, v7
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2390,90 +2717,104 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX12-NEXT:    v_mul_lo_u32 v27, v6, v9
+; GFX12-NEXT:    v_mov_b32_e32 v18, v2
 ; GFX12-NEXT:    v_mul_lo_u32 v7, v7, v8
-; GFX12-NEXT:    v_mul_lo_u32 v28, v5, v10
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mul_lo_u32 v27, v6, v9
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v16, v14, 0
-; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
-; GFX12-NEXT:    v_mul_lo_u32 v30, v17, v14
+; GFX12-NEXT:    v_mad_co_u64_u32 v[19:20], null, v16, v12, 0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[21:22], null, v16, v10, 0
+; GFX12-NEXT:    v_mul_lo_u32 v28, v4, v11
+; GFX12-NEXT:    v_mul_lo_u32 v29, v5, v10
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
-; GFX12-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
-; GFX12-NEXT:    v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
-; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[19:20], s0, v17, v11, v[19:20]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[21:22], s0, v17, v9, v[21:22]
+; GFX12-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v18, v12, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[19:20], vcc_lo, v18, v10, v[19:20]
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX12-NEXT:    v_mul_lo_u32 v14, v17, v14
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[19:20], vcc_lo, v3, v9, v[19:20]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
+; GFX12-NEXT:    v_mad_co_u64_u32 v[19:20], vcc_lo, v4, v8, v[19:20]
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v2, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
-; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX12-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v20, v22
-; GFX12-NEXT:    v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
-; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v19, v22
-; GFX12-NEXT:    v_mul_lo_u32 v22, v16, v15
-; GFX12-NEXT:    v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[23:24], null, v6, v8, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v1, v19
+; GFX12-NEXT:    v_mad_co_u64_u32 v[21:22], s0, v18, v8, v[21:22]
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v25, s0, 0, v25, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_dual_mov_b32 v19, v20 :: v_dual_mov_b32 v20, v23
+; GFX12-NEXT:    v_mov_b32_e32 v0, v22
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[19:20], vcc_lo, v16, v13, v[19:20]
+; GFX12-NEXT:    v_mul_lo_u32 v13, v18, v13
+; GFX12-NEXT:    v_mad_co_u64_u32 v[22:23], s0, v16, v11, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
-; GFX12-NEXT:    v_mov_b32_e32 v20, v18
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
-; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
-; GFX12-NEXT:    v_mul_lo_u32 v20, v4, v11
-; GFX12-NEXT:    v_mul_lo_u32 v25, v3, v12
-; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
-; GFX12-NEXT:    v_mul_lo_u32 v24, v2, v13
-; GFX12-NEXT:    v_mov_b32_e32 v13, v1
-; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15]
-; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19]
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
-; GFX12-NEXT:    v_mov_b32_e32 v14, v21
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
-; GFX12-NEXT:    v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14]
-; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
-; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
-; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
-; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
+; GFX12-NEXT:    v_mad_co_u64_u32 v[19:20], s1, v17, v12, v[19:20]
+; GFX12-NEXT:    v_mul_lo_u32 v12, v3, v12
+; GFX12-NEXT:    v_mad_co_u64_u32 v[22:23], s0, v17, v10, v[22:23]
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s0, 0, v2, s0
+; GFX12-NEXT:    v_mov_b32_e32 v2, v21
+; GFX12-NEXT:    v_mad_co_u64_u32 v[19:20], s2, v18, v11, v[19:20]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[22:23], s0, v18, v9, v[22:23]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s3, v16, v9, v[1:2]
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s0, 0, v6, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[21:22], s0, v3, v8, v[22:23]
+; GFX12-NEXT:    v_cndmask_b32_e64 v23, 0, 1, s3
+; GFX12-NEXT:    v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[19:20]
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v19, s0, 0, v6, s0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s0, v17, v8, v[1:2]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_add_co_u32 v3, s4, v23, v21
+; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX12-NEXT:    v_mad_co_u64_u32 v[9:10], s0, v4, v9, v[10:11]
+; GFX12-NEXT:    v_add_co_u32 v4, s6, v25, v22
+; GFX12-NEXT:    v_add_co_u32 v3, s5, v3, v6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_or_b32 s4, s4, s5
+; GFX12-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[9:10]
+; GFX12-NEXT:    v_mul_lo_u32 v10, v16, v15
+; GFX12-NEXT:    v_add_co_u32 v4, s5, v4, v11
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_or_b32 s5, s6, s5
+; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_add_co_u32 v5, s5, v19, v5
+; GFX12-NEXT:    v_add_co_u32 v6, s7, v26, v6
+; GFX12-NEXT:    v_add_co_u32 v5, s6, v5, v9
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_or_b32 s5, s5, s6
+; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
+; GFX12-NEXT:    v_add_co_u32 v6, s5, v6, v9
+; GFX12-NEXT:    s_or_b32 s5, s7, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s5, v24, v10, s5
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v14, s4
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s0, v9, v13, s0
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s0, v9, v12, s3
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s0, v9, v28, s2
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s0, v9, v29, s1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v9, v27, vcc_lo
 ; GFX12-NEXT:    v_add_nc_u32_e32 v7, v8, v7
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i256 %num, %den
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 1821d29d4b050b..ae6bcb6b082027 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2618,10 +2618,13 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v2
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[4:5]
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2630,24 +2633,45 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[4:5], v1, v2
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10PLUS-LABEL: v_uaddsat_i48:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
-; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
-; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_uaddsat_i48:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v2
+; GFX10-NEXT:    v_add_co_u32 v1, s4, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v2
+; GFX10-NEXT:    s_or_b32 s4, s4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uaddsat_i48:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX11-NEXT:    v_add_co_u32 v0, s1, v0, v2
+; GFX11-NEXT:    v_add_co_u32 v1, s0, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    v_add_co_u32 v1, s1, v1, v2
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
   ret i48 %result
 }
@@ -2677,7 +2701,14 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
-; GFX8-NEXT:    s_addc_u32 s1, s1, s3
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s3
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_or_b32 s2, s3, s2
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -2687,7 +2718,14 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
-; GFX9-NEXT:    s_addc_u32 s1, s1, s3
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s3
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_or_b32 s2, s3, s2
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -2697,7 +2735,14 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s2
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s3
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s3
+; GFX10PLUS-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s2
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s2, s3, s2
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
@@ -2728,11 +2773,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s1, v1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
+; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2740,11 +2787,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s1, v1
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -2752,10 +2801,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX10PLUS-NEXT:    v_add_co_u32 v0, s0, s0, v0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s1, s1, v1
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s0, v1, v2
+; GFX10PLUS-NEXT:    s_or_b32 s0, s1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s0
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
@@ -2787,11 +2839,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s1, v1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
+; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2799,11 +2853,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s1, v1
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -2811,10 +2867,13 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s0
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX10PLUS-NEXT:    v_add_co_u32 v0, s0, v0, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s1, v1, s1
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s0, v1, v2
+; GFX10PLUS-NEXT:    s_or_b32 s0, s1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s0
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
@@ -2827,38 +2886,62 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v2
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v2
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], v0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[4:5], v1, v2
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10PLUS-LABEL: v_uaddsat_i64:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_uaddsat_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v2
+; GFX10-NEXT:    v_add_co_u32 v1, s4, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v2
+; GFX10-NEXT:    s_or_b32 s4, s4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uaddsat_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v0, s1, v0, v2
+; GFX11-NEXT:    v_add_co_u32 v1, s0, v1, v3
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    v_add_co_u32 v1, s1, v1, v2
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
 }
@@ -2867,28 +2950,56 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: s_uaddsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s2
-; GFX6-NEXT:    s_addc_u32 s1, s1, s3
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_add_u32 s1, s1, s3
+; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX6-NEXT:    s_add_u32 s1, s1, s2
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_or_b32 s2, s3, s2
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_uaddsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
-; GFX8-NEXT:    s_addc_u32 s1, s1, s3
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s3
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_or_b32 s2, s3, s2
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
-; GFX9-NEXT:    s_addc_u32 s1, s1, s3
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s3
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_or_b32 s2, s3, s2
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_uaddsat_i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s2
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s3
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s3
+; GFX10PLUS-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s2
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s2, s3, s2
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
@@ -2898,37 +3009,46 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX6-LABEL: uaddsat_i64_sv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s1, v1
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], s0, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v2
+; GFX6-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: uaddsat_i64_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s1, v1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
+; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: uaddsat_i64_sv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s1, v1
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: uaddsat_i64_sv:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX10PLUS-NEXT:    v_add_co_u32 v0, s0, s0, v0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s1, s1, v1
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s0, v1, v2
+; GFX10PLUS-NEXT:    s_or_b32 s0, s1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
   %cast = bitcast i64 %result to <2 x float>
@@ -2938,37 +3058,46 @@ define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: uaddsat_i64_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s1, v1
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], s0, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v2
+; GFX6-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: uaddsat_i64_vs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s1, v1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
+; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: uaddsat_i64_vs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s1, v1
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v2
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: uaddsat_i64_vs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s0
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX10PLUS-NEXT:    v_add_co_u32 v0, s0, v0, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s1, v1, s1
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s0, v1, v2
+; GFX10PLUS-NEXT:    s_or_b32 s0, s1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
   %cast = bitcast i64 %result to <2 x float>
@@ -2979,51 +3108,75 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v4
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v4
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[4:5]
+; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v4
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
+; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v4
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v4
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v6
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v4
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], v0, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[4:5], v1, v4
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v2, s[4:5], v2, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[4:5], v3, v4
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX10-NEXT:    v_add_co_u32 v2, s4, v2, v6
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s4, v3, v7, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v4
+; GFX10-NEXT:    v_add_co_u32 v1, s4, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v2, s6, v2, v6
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v1, s6, v1, v4
+; GFX10-NEXT:    s_or_b32 s4, s4, s6
+; GFX10-NEXT:    v_add_co_u32 v3, s7, v3, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s4
+; GFX10-NEXT:    s_or_b32 s4, s5, s7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, -1, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -3031,12 +3184,18 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX11-LABEL: v_uaddsat_v2i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT:    v_add_co_u32 v2, s0, v2, v6
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v3, v7, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, s1, v0, v4
+; GFX11-NEXT:    v_add_co_u32 v1, s0, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT:    v_add_co_u32 v2, s2, v2, v6
+; GFX11-NEXT:    v_add_co_u32 v3, s1, v3, v7
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s2
+; GFX11-NEXT:    v_add_co_u32 v1, s2, v1, v4
+; GFX11-NEXT:    s_or_b32 s0, s0, s2
+; GFX11-NEXT:    v_add_co_u32 v3, s3, v3, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, -1, s0
+; GFX11-NEXT:    s_or_b32 s0, s1, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, -1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, -1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -3048,40 +3207,96 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-LABEL: s_uaddsat_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s4
-; GFX6-NEXT:    s_addc_u32 s1, s1, s5
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_add_u32 s1, s1, s5
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_add_u32 s1, s1, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_or_b32 s4, s5, s4
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT:    s_add_u32 s2, s2, s6
-; GFX6-NEXT:    s_addc_u32 s3, s3, s7
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_add_u32 s3, s3, s7
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_add_u32 s3, s3, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_or_b32 s4, s5, s4
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_uaddsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s4
-; GFX8-NEXT:    s_addc_u32 s1, s1, s5
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s5
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_or_b32 s4, s5, s4
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    s_add_u32 s2, s2, s6
-; GFX8-NEXT:    s_addc_u32 s3, s3, s7
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_add_u32 s3, s3, s7
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_add_u32 s3, s3, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_or_b32 s4, s5, s4
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s5
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_or_b32 s4, s5, s4
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s2, s2, s6
-; GFX9-NEXT:    s_addc_u32 s3, s3, s7
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_add_u32 s3, s3, s7
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_add_u32 s3, s3, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_or_b32 s4, s5, s4
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_uaddsat_v2i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s4
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s5
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s5
+; GFX10PLUS-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s4
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s4, s5, s4
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    s_add_u32 s2, s2, s6
-; GFX10PLUS-NEXT:    s_addc_u32 s3, s3, s7
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s3, s3, s7
+; GFX10PLUS-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s3, s3, s4
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s4, s5, s4
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
@@ -3092,8 +3307,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: s_uaddsat_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s4
-; GFX6-NEXT:    s_addc_u32 s1, s1, s5
-; GFX6-NEXT:    s_addc_u32 s2, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_add_u32 s1, s1, s5
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_add_u32 s1, s1, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_or_b32 s4, s5, s4
+; GFX6-NEXT:    s_add_u32 s2, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_add_u32 s2, s2, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_or_b32 s4, s5, s4
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s7
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
@@ -3102,8 +3329,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-LABEL: s_uaddsat_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s4
-; GFX8-NEXT:    s_addc_u32 s1, s1, s5
-; GFX8-NEXT:    s_addc_u32 s2, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s5
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_or_b32 s4, s5, s4
+; GFX8-NEXT:    s_add_u32 s2, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_add_u32 s2, s2, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_or_b32 s4, s5, s4
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s3, s7
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
@@ -3112,8 +3351,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-LABEL: s_uaddsat_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
-; GFX9-NEXT:    s_addc_u32 s1, s1, s5
-; GFX9-NEXT:    s_addc_u32 s2, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s5
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_or_b32 s4, s5, s4
+; GFX9-NEXT:    s_add_u32 s2, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_or_b32 s4, s5, s4
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s7
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
@@ -3122,8 +3373,20 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10PLUS-LABEL: s_uaddsat_i128:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s4
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s5
-; GFX10PLUS-NEXT:    s_addc_u32 s2, s2, s6
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s5
+; GFX10PLUS-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s4
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s4, s5, s4
+; GFX10PLUS-NEXT:    s_add_u32 s2, s2, s6
+; GFX10PLUS-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_add_u32 s2, s2, s4
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s4, s5, s4
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s3, s3, s7
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
@@ -3135,13 +3398,17 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX6-LABEL: uaddsat_i128_sv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v4, s1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, s2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s3
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s1, v1
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], s0, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v4
+; GFX6-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], s2, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, s3
+; GFX6-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3150,13 +3417,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ;
 ; GFX8-LABEL: uaddsat_i128_sv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s1, v1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v4
+; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], s2, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3165,13 +3436,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ;
 ; GFX9-LABEL: uaddsat_i128_sv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s1, v1
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v4
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v2, s[4:5], s2, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3180,9 +3455,15 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ;
 ; GFX10PLUS-LABEL: uaddsat_i128_sv:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX10PLUS-NEXT:    v_add_co_u32 v0, s0, s0, v0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s1, s1, v1
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v2, s2, s2, v2
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s0, v1, v4
+; GFX10PLUS-NEXT:    s_or_b32 s0, s1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v2, s0, v2, v4
+; GFX10PLUS-NEXT:    s_or_b32 vcc_lo, s2, s0
 ; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
@@ -3197,13 +3478,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: uaddsat_i128_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_mov_b32_e32 v4, s1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, s2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s3
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s1, v1
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], s0, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v4
+; GFX6-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], s2, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_mov_b32_e32 v4, s3
+; GFX6-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3212,13 +3497,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ;
 ; GFX8-LABEL: uaddsat_i128_vs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s1, v1
+; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v4
+; GFX8-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], s2, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3227,13 +3516,17 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ;
 ; GFX9-LABEL: uaddsat_i128_vs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s1, v1
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[0:1], s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v4
+; GFX9-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v2, s[4:5], s2, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3242,9 +3535,15 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ;
 ; GFX10PLUS-LABEL: uaddsat_i128_vs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s0
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX10PLUS-NEXT:    v_add_co_u32 v0, s0, v0, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s1, v1, s1
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v2, s2, v2, s2
+; GFX10PLUS-NEXT:    v_add_co_u32 v1, s0, v1, v4
+; GFX10PLUS-NEXT:    s_or_b32 s0, s1, s0
+; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10PLUS-NEXT:    v_add_co_u32 v2, s0, v2, v4
+; GFX10PLUS-NEXT:    s_or_b32 vcc_lo, s2, s0
 ; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
@@ -3260,17 +3559,29 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v2, v10, vcc
+; GFX6-NEXT:    v_add_i32_e64 v0, s[4:5], v0, v8
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v8
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_add_i32_e64 v2, s[6:7], v2, v10
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; GFX6-NEXT:    s_or_b64 vcc, s[6:7], vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
+; GFX6-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v13, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v6, v14, vcc
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX6-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v8
+; GFX6-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_add_i32_e64 v6, s[6:7], v6, v14
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GFX6-NEXT:    s_or_b64 vcc, s[6:7], vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v7, v15, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, -1, vcc
@@ -3281,17 +3592,29 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-LABEL: v_uaddsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v2, v10, vcc
+; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v8
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v8
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v2, s[6:7], v2, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
+; GFX8-NEXT:    s_or_b64 vcc, s[6:7], vcc
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
+; GFX8-NEXT:    v_add_u32_e64 v4, s[4:5], v4, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v12
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v13, vcc
-; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v6, v14, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v13
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v5, s[4:5], v5, v8
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_add_u32_e64 v6, s[6:7], v6, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
+; GFX8-NEXT:    s_or_b64 vcc, s[6:7], vcc
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v15, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, -1, vcc
@@ -3302,17 +3625,29 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-LABEL: v_uaddsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], v0, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v1, s[4:5], v1, v8
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v2, s[6:7], v2, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v8
+; GFX9-NEXT:    s_or_b64 vcc, s[6:7], vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v11, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v12
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v13, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v14, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[4:5], v5, v8
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[6:7], v6, v14
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v8
+; GFX9-NEXT:    s_or_b64 vcc, s[6:7], vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v15, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, -1, vcc
@@ -3323,18 +3658,30 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-LABEL: v_uaddsat_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v8
-; GFX10-NEXT:    v_add_co_u32 v4, s4, v4, v12
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s4, v5, v13, s4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s4, v6, v14, s4
+; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v8
+; GFX10-NEXT:    v_add_co_u32 v1, s4, v1, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v2, s6, v2, v10
+; GFX10-NEXT:    v_add_co_u32 v4, s7, v4, v12
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v8
+; GFX10-NEXT:    s_or_b32 s4, s4, s5
+; GFX10-NEXT:    v_add_co_u32 v5, s5, v5, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s4
+; GFX10-NEXT:    v_add_co_u32 v2, s4, v2, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s7
+; GFX10-NEXT:    s_or_b32 vcc_lo, s6, s4
+; GFX10-NEXT:    v_add_co_u32 v6, s6, v6, v14
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v7, s4, v7, v15, s4
+; GFX10-NEXT:    v_add_co_u32 v5, s4, v5, v8
+; GFX10-NEXT:    s_or_b32 s4, s5, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v6, s4, v6, v8
+; GFX10-NEXT:    s_or_b32 s4, s6, s4
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v7, s4, v7, v15, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, -1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, -1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, -1, s4
@@ -3344,18 +3691,30 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-LABEL: v_uaddsat_v2i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v8
-; GFX11-NEXT:    v_add_co_u32 v4, s0, v4, v12
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s0, v5, v13, s0
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s0, v6, v14, s0
+; GFX11-NEXT:    v_add_co_u32 v0, s1, v0, v8
+; GFX11-NEXT:    v_add_co_u32 v1, s0, v1, v9
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
+; GFX11-NEXT:    v_add_co_u32 v2, s2, v2, v10
+; GFX11-NEXT:    v_add_co_u32 v4, s3, v4, v12
+; GFX11-NEXT:    v_add_co_u32 v1, s1, v1, v8
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    v_add_co_u32 v5, s1, v5, v13
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
+; GFX11-NEXT:    v_add_co_u32 v2, s0, v2, v8
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s3
+; GFX11-NEXT:    s_or_b32 vcc_lo, s2, s0
+; GFX11-NEXT:    v_add_co_u32 v6, s2, v6, v14
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s0, v7, v15, s0
+; GFX11-NEXT:    v_add_co_u32 v5, s0, v5, v8
+; GFX11-NEXT:    s_or_b32 s0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v6, s0, v6, v8
+; GFX11-NEXT:    s_or_b32 s0, s2, s0
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s0, v7, v15, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, -1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, -1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, -1, s0
@@ -3369,14 +3728,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-LABEL: s_uaddsat_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s8
-; GFX6-NEXT:    s_addc_u32 s1, s1, s9
-; GFX6-NEXT:    s_addc_u32 s2, s2, s10
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_add_u32 s1, s1, s9
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_add_u32 s1, s1, s8
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_or_b32 s8, s9, s8
+; GFX6-NEXT:    s_add_u32 s2, s2, s10
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_add_u32 s2, s2, s8
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_or_b32 s8, s9, s8
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s11
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX6-NEXT:    s_add_u32 s4, s4, s12
-; GFX6-NEXT:    s_addc_u32 s5, s5, s13
-; GFX6-NEXT:    s_addc_u32 s6, s6, s14
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_add_u32 s5, s5, s13
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_add_u32 s5, s5, s8
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_or_b32 s8, s9, s8
+; GFX6-NEXT:    s_add_u32 s6, s6, s14
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_add_u32 s6, s6, s8
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_or_b32 s8, s9, s8
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s7, s7, s15
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, s[6:7]
@@ -3385,14 +3768,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-LABEL: s_uaddsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s8
-; GFX8-NEXT:    s_addc_u32 s1, s1, s9
-; GFX8-NEXT:    s_addc_u32 s2, s2, s10
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s9
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_add_u32 s1, s1, s8
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_or_b32 s8, s9, s8
+; GFX8-NEXT:    s_add_u32 s2, s2, s10
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_add_u32 s2, s2, s8
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_or_b32 s8, s9, s8
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s3, s11
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX8-NEXT:    s_add_u32 s4, s4, s12
-; GFX8-NEXT:    s_addc_u32 s5, s5, s13
-; GFX8-NEXT:    s_addc_u32 s6, s6, s14
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_add_u32 s5, s5, s13
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_add_u32 s5, s5, s8
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_or_b32 s8, s9, s8
+; GFX8-NEXT:    s_add_u32 s6, s6, s14
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_add_u32 s6, s6, s8
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_or_b32 s8, s9, s8
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s7, s7, s15
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], -1, s[6:7]
@@ -3401,14 +3808,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-LABEL: s_uaddsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s8
-; GFX9-NEXT:    s_addc_u32 s1, s1, s9
-; GFX9-NEXT:    s_addc_u32 s2, s2, s10
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s9
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_add_u32 s1, s1, s8
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_or_b32 s8, s9, s8
+; GFX9-NEXT:    s_add_u32 s2, s2, s10
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_add_u32 s2, s2, s8
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_or_b32 s8, s9, s8
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s11
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX9-NEXT:    s_add_u32 s4, s4, s12
-; GFX9-NEXT:    s_addc_u32 s5, s5, s13
-; GFX9-NEXT:    s_addc_u32 s6, s6, s14
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_add_u32 s5, s5, s13
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_add_u32 s5, s5, s8
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_or_b32 s8, s9, s8
+; GFX9-NEXT:    s_add_u32 s6, s6, s14
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_add_u32 s6, s6, s8
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_or_b32 s8, s9, s8
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s7, s7, s15
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, s[6:7]
@@ -3417,14 +3848,38 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10PLUS-LABEL: s_uaddsat_v2i128:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s8
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s9
-; GFX10PLUS-NEXT:    s_addc_u32 s2, s2, s10
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s9
+; GFX10PLUS-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s1, s1, s8
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s8, s9, s8
+; GFX10PLUS-NEXT:    s_add_u32 s2, s2, s10
+; GFX10PLUS-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_add_u32 s2, s2, s8
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s8, s9, s8
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s3, s3, s11
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX10PLUS-NEXT:    s_add_u32 s4, s4, s12
-; GFX10PLUS-NEXT:    s_addc_u32 s5, s5, s13
-; GFX10PLUS-NEXT:    s_addc_u32 s6, s6, s14
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s5, s5, s13
+; GFX10PLUS-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s5, s5, s8
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s8, s9, s8
+; GFX10PLUS-NEXT:    s_add_u32 s6, s6, s14
+; GFX10PLUS-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_add_u32 s6, s6, s8
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_or_b32 s8, s9, s8
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s7, s7, s15
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[6:7], -1, s[6:7]
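
For readers skimming the regenerated check lines: the longer sequences above all implement the same carry-chain rewrite, on both the scalar (s_add_u32/s_cselect_b32/s_or_b32) and vector (v_add_co_u32/v_cndmask/s_or_b32) paths. A rough C sketch of the semantics follows; adde_expanded is a hypothetical helper written for illustration, not code from this patch.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper, not part of the patch: models one link of the
   expanded carry chain that replaces s_addc_u32 in the checks above.
   One add-with-carry-in/out becomes two overflowing adds whose
   carry-outs are ORed together. */
static uint32_t adde_expanded(uint32_t x, uint32_t y, bool carry_in,
                              bool *carry_out) {
  uint32_t t = x + y;                  /* s_add_u32              */
  bool c0 = t < x;                     /* s_cselect_b32 _, 1, 0  */
  uint32_t r = t + (uint32_t)carry_in; /* s_add_u32              */
  bool c1 = r < t;                     /* s_cselect_b32 _, 1, 0  */
  *carry_out = c0 | c1;                /* s_or_b32               */
  return r;
}

The OR is exact because the two partial adds can never both overflow: if x + y wraps, the wrapped sum is at most 2^32 - 2, so adding the carry-in cannot wrap again. As far as I can tell, the trailing s_and_b32 _, _, 1 / s_cmp_lg_u32 _, 0 pairs in the scalar checks just materialize this boolean back into SCC so the final s_addc_u32 of each chain can consume it.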


