[llvm] [GlobalIsel] Combine ADDO (PR #82927)
Thorsten Schütt via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 14 00:37:33 PDT 2024
https://github.com/tschuett updated https://github.com/llvm/llvm-project/pull/82927
>From d2db0143afefed02f22ef2e800de097aae563b22 Mon Sep 17 00:00:00 2001
From: Thorsten Schütt <schuett at gmail.com>
Date: Sat, 24 Feb 2024 20:04:01 +0100
Subject: [PATCH 1/5] [GlobalIsel] Combine ADDO
The overflow opcodes perform the requested arithmetic and produce a
carry output in addition to the normal result.
Clang exposes them as builtins (__builtin_add_overflow_p), and the
middle end has intrinsics for them (sadd_with_overflow).
On AArch64, they lower to ADDS (add and set flags).
On Neoverse V2, they run at half the throughput of basic arithmetic
and can only issue on a limited set of pipelines.
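For illustration, a minimal IR sketch (not part of the patch; the real
coverage is in the updated tests below) of the intrinsic form that the
IRTranslator lowers to G_SADDO and that these combines then fold:

  ; Both operands are constants, so the combine folds the G_SADDO into a
  ; constant result plus a known (zero) overflow bit.
  declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)

  define { i32, i1 } @sample_saddo() {
    %r = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 9, i32 11)
    ret { i32, i1 } %r
  }

The saddo1.i32.fold test in llvm/test/CodeGen/AArch64/overflow.ll below
exercises exactly this pattern.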
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 12 +-
.../CodeGen/GlobalISel/GenericMachineInstrs.h | 19 +
.../include/llvm/Target/GlobalISel/Combine.td | 16 +-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 234 +++++++++-
.../AArch64/GlobalISel/combine-overflow.mir | 94 ++++
.../prelegalizer-combiner-addo-zero.mir | 2 +-
llvm/test/CodeGen/AArch64/overflow.ll | 78 ++--
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 232 +++++----
.../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 441 +++++++++---------
.../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 408 ++++++++--------
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 229 +++++----
.../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 8 -
.../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 8 -
13 files changed, 1029 insertions(+), 752 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 23728636498ba0..9e8fc5d635c50a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -696,10 +696,6 @@ class CombinerHelper {
/// (G_*MULO x, 0) -> 0 + no carry out
bool matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
- /// Match:
- /// (G_*ADDO x, 0) -> x + no carry out
- bool matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo);
-
/// Match:
/// (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
/// (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -810,12 +806,15 @@ class CombinerHelper {
/// Combine selects.
bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo);
- /// Combine ands,
+ /// Combine ands.
bool matchAnd(MachineInstr &MI, BuildFnTy &MatchInfo);
- /// Combine ors,
+ /// Combine ors.
bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo);
+ /// Combine addos.
+ bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -919,6 +918,7 @@ class CombinerHelper {
bool isZeroOrZeroSplat(Register Src, bool AllowUndefs);
bool isConstantSplatVector(Register Src, int64_t SplatValue,
bool AllowUndefs);
+ bool isConstantOrConstantVectorI(Register Src) const;
std::optional<APInt> getConstantOrConstantSplatVector(Register Src);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index f5a6528d10a973..6b03703192df91 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -359,6 +359,8 @@ class GBinOpCarryOut : public GenericMachineInstr {
Register getCarryOutReg() const { return getReg(1); }
MachineOperand &getLHS() { return getOperand(2); }
MachineOperand &getRHS() { return getOperand(3); }
+ Register getLHSReg() const { return getOperand(2).getReg(); }
+ Register getRHSReg() const { return getOperand(3).getReg(); }
static bool classof(const MachineInstr *MI) {
switch (MI->getOpcode()) {
@@ -429,6 +431,23 @@ class GAddSubCarryOut : public GBinOpCarryOut {
}
};
+/// Represents overflowing add operations.
+/// G_UADDO, G_SADDO
+class GAddCarryOut : public GBinOpCarryOut {
+public:
+ bool isSigned() const { return getOpcode() == TargetOpcode::G_SADDO; }
+
+ static bool classof(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_SADDO:
+ return true;
+ default:
+ return false;
+ }
+ }
+};
+
/// Represents overflowing add/sub operations that also consume a carry-in.
/// G_UADDE, G_SADDE, G_USUBE, G_SSUBE
class GAddSubCarryInOut : public GAddSubCarryOut {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9f18a5b8560098..6980cbd04aeb1c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1090,12 +1090,6 @@ def mulo_by_0: GICombineRule<
[{ return Helper.matchMulOBy0(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
-def addo_by_0: GICombineRule<
- (defs root:$root, build_fn_matchinfo:$matchinfo),
- (match (wip_match_opcode G_UADDO, G_SADDO):$root,
- [{ return Helper.matchAddOBy0(*${root}, ${matchinfo}); }]),
- (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
-
// Transform (uadde x, y, 0) -> (uaddo x, y)
// (sadde x, y, 0) -> (saddo x, y)
// (usube x, y, 0) -> (usubo x, y)
@@ -1291,6 +1285,12 @@ def match_ors : GICombineRule<
[{ return Helper.matchOr(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+def match_addos : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_SADDO, G_UADDO):$root,
+ [{ return Helper.matchAddOverflow(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
// Combines concat operations
def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
def combine_concat_vector : GICombineRule<
@@ -1326,7 +1326,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,
overlapping_and, mulo_by_2, mulo_by_0,
- addo_by_0, adde_to_addo,
+ adde_to_addo,
combine_minmax_nan]>;
def known_bits_simplifications : GICombineGroup<[
@@ -1374,7 +1374,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
and_or_disjoint_mask, fma_combines, fold_binop_into_select,
sub_add_reg, select_to_minmax, redundant_binop_in_equality,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
- combine_concat_vector, double_icmp_zero_and_or_combine]>;
+ combine_concat_vector, double_icmp_zero_and_or_combine, match_addos]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 2e706b46801938..779af42ae70807 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -4936,24 +4936,6 @@ bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
return true;
}
-bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
- // (G_*ADDO x, 0) -> x + no carry out
- assert(MI.getOpcode() == TargetOpcode::G_UADDO ||
- MI.getOpcode() == TargetOpcode::G_SADDO);
- if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
- return false;
- Register Carry = MI.getOperand(1).getReg();
- if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
- return false;
- Register Dst = MI.getOperand(0).getReg();
- Register LHS = MI.getOperand(2).getReg();
- MatchInfo = [=](MachineIRBuilder &B) {
- B.buildCopy(Dst, LHS);
- B.buildConstant(Carry, 0);
- };
- return true;
-}
-
bool CombinerHelper::matchAddEToAddO(MachineInstr &MI, BuildFnTy &MatchInfo) {
// (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
// (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
@@ -6354,6 +6336,26 @@ CombinerHelper::getConstantOrConstantSplatVector(Register Src) {
return Value;
}
+// FIXME G_SPLAT_VECTOR
+bool CombinerHelper::isConstantOrConstantVectorI(Register Src) const {
+ auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+ if (IConstant)
+ return true;
+
+ GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+ if (!BuildVector)
+ return false;
+
+ unsigned NumSources = BuildVector->getNumSources();
+ for (unsigned I = 0; I < NumSources; ++I) {
+ std::optional<ValueAndVReg> IConstant =
+ getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+ if (!IConstant)
+ return false;
+ }
+ return true;
+}
+
// TODO: use knownbits to determine zeros
bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
BuildFnTy &MatchInfo) {
@@ -6928,3 +6930,199 @@ bool CombinerHelper::matchOr(MachineInstr &MI, BuildFnTy &MatchInfo) {
return false;
}
+
+bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
+ GAddCarryOut *Add = cast<GAddCarryOut>(&MI);
+
+ // Addo has no flags
+ Register Dst = Add->getReg(0);
+ Register Carry = Add->getReg(1);
+ Register LHS = Add->getLHSReg();
+ Register RHS = Add->getRHSReg();
+ bool IsSigned = Add->isSigned();
+ LLT DstTy = MRI.getType(Dst);
+ LLT CarryTy = MRI.getType(Carry);
+
+ // Only fold the [u|s]addo if its result has a single non-debug use.
+ if (!MRI.hasOneNonDBGUse(Dst))
+ return false;
+
+ // If the carry is dead, fold addo -> add + undef carry.
+ if (MRI.use_nodbg_empty(Carry) &&
+ isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}})) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS);
+ B.buildUndef(Carry);
+ };
+ return true;
+ }
+
+ // Likewise, only fold if the carry has a single non-debug use.
+ if (!MRI.hasOneNonDBGUse(Carry))
+ return false;
+
+ // Canonicalize constant to RHS.
+ if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) {
+ if (IsSigned) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildSAddo(Dst, Carry, RHS, LHS);
+ };
+ return true;
+ } else {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildUAddo(Dst, Carry, RHS, LHS);
+ };
+ return true;
+ }
+ }
+
+ std::optional<APInt> MaybeLHS = getConstantOrConstantSplatVector(LHS);
+ std::optional<APInt> MaybeRHS = getConstantOrConstantSplatVector(RHS);
+
+ // Fold addo(c1, c2) -> c3, carry.
+ if (MaybeLHS && MaybeRHS && isConstantLegalOrBeforeLegalizer(DstTy) &&
+ isConstantLegalOrBeforeLegalizer(CarryTy)) {
+ // Both constants must have the same bit width, otherwise APInt may
+ // assert. Before legalization, they may have widely different bit widths.
+ unsigned BitWidth =
+ std::max(MaybeLHS->getBitWidth(), MaybeRHS->getBitWidth());
+ bool Overflow;
+ APInt Result;
+ if (IsSigned) {
+ APInt LHS = MaybeLHS->sext(BitWidth);
+ APInt RHS = MaybeRHS->sext(BitWidth);
+ Result = LHS.sadd_ov(RHS, Overflow);
+ } else {
+ APInt LHS = MaybeLHS->zext(BitWidth);
+ APInt RHS = MaybeRHS->zext(BitWidth);
+ Result = LHS.uadd_ov(RHS, Overflow);
+ }
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildConstant(Dst, Result);
+ B.buildConstant(Carry, Overflow);
+ };
+ return true;
+ }
+
+ // Fold (addo x, 0) -> x, no carry out.
+ if (MaybeRHS && *MaybeRHS == 0 && isConstantLegalOrBeforeLegalizer(CarryTy)) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildCopy(Dst, LHS);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+
+ // Given 2 constant operands whose sum does not overflow:
+ // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
+ // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
+ GAdd *AddLHS = getOpcodeDef<GAdd>(LHS, MRI);
+ if (MaybeRHS && AddLHS && MRI.hasOneNonDBGUse(Add->getReg(0)) &&
+ ((IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoSWrap)) ||
+ (!IsSigned && AddLHS->getFlag(MachineInstr::MIFlag::NoUWrap)))) {
+ std::optional<APInt> MaybeAddRHS =
+ getConstantOrConstantSplatVector(AddLHS->getRHSReg());
+ if (MaybeAddRHS) {
+ unsigned BitWidth =
+ std::max(MaybeRHS->getBitWidth(), MaybeAddRHS->getBitWidth());
+ bool Overflow;
+ APInt NewC;
+ if (IsSigned) {
+ APInt LHS = MaybeRHS->sext(BitWidth);
+ APInt RHS = MaybeAddRHS->sext(BitWidth);
+ NewC = LHS.sadd_ov(RHS, Overflow);
+ } else {
+ APInt LHS = MaybeRHS->zext(BitWidth);
+ APInt RHS = MaybeAddRHS->zext(BitWidth);
+ NewC = LHS.uadd_ov(RHS, Overflow);
+ }
+ if (!Overflow && isConstantLegalOrBeforeLegalizer(DstTy)) {
+ if (IsSigned) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto ConstRHS = B.buildConstant(DstTy, NewC);
+ B.buildSAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+ };
+ return true;
+ } else {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto ConstRHS = B.buildConstant(DstTy, NewC);
+ B.buildUAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+ };
+ return true;
+ }
+ }
+ }
+ }
+
+ // Try to simplify uaddo when known bits prove whether it overflows.
+ if (!IsSigned && isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) &&
+ isConstantLegalOrBeforeLegalizer(DstTy)) {
+ ConstantRange CRLHS =
+ ConstantRange::fromKnownBits(KB->getKnownBits(LHS), false /*IsSigned*/);
+ ConstantRange CRRHS =
+ ConstantRange::fromKnownBits(KB->getKnownBits(RHS), false /*IsSigned*/);
+
+ switch (CRLHS.unsignedAddMayOverflow(CRRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoUWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // Try to simplify saddo when known bits prove whether it overflows.
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) ||
+ !isConstantLegalOrBeforeLegalizer(CarryTy))
+ return false;
+
+ // If LHS and RHS each have at least two sign bits, then there is no signed
+ // overflow.
+ if (KB->computeNumSignBits(LHS) > 1 && KB->computeNumSignBits(RHS) > 1) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+
+ ConstantRange CRLHS =
+ ConstantRange::fromKnownBits(KB->getKnownBits(LHS), true /*IsSigned*/);
+ ConstantRange CRRHS =
+ ConstantRange::fromKnownBits(KB->getKnownBits(RHS), true /*IsSigned*/);
+
+ switch (CRLHS.signedAddMayOverflow(CRRHS)) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return false;
+ case ConstantRange::OverflowResult::NeverOverflows: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+ }
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh: {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildAdd(Dst, LHS, RHS);
+ B.buildConstant(Carry, 1);
+ };
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
new file mode 100644
index 00000000000000..2967230cea6174
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+
+---
+name: add_unused
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_unused
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: %add:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %add:_(s32), %o:_(s1) = G_SADDO %0, %1
+ $w0 = COPY %add(s32)
+ RET_ReallyLR implicit $w0
+...
+---
+name: add_canon
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_canon
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 10
+ ; CHECK-NEXT: %add:_(s32), %o:_(s1) = G_SADDO [[COPY]], %const
+ ; CHECK-NEXT: %o_wide:_(s32) = G_ZEXT %o(s1)
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %const:_(s32) = G_CONSTANT i32 10
+ %add:_(s32), %o:_(s1) = G_SADDO %const, %1
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %add(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
+---
+name: add_const_fold
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_const_fold
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %add:_(s32) = G_CONSTANT i32 21
+ ; CHECK-NEXT: %o_wide:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: $w1 = COPY %o_wide(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %const:_(s32) = G_CONSTANT i32 10
+ %const1:_(s32) = G_CONSTANT i32 11
+ %add:_(s32), %o:_(s1) = G_UADDO %const, %const1
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %add(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
+---
+name: add_add_zero
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_add_zero
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w2
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[C]](s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %2:_(s32) = COPY $w2
+ %const:_(s32) = G_CONSTANT i32 10
+ %addl:_(s32) = nsw G_ADD %2, %const
+ %const1:_(s32) = G_CONSTANT i32 -10
+ %add:_(s32), %o:_(s1) = G_SADDO %addl, %const1
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %add(s32)
+ $w1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
index 94f56e5650b22f..9483cbf06f4057 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-addo-zero.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="addo_by_0" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple aarch64 -debugify-and-strip-all-safe -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombiner-only-enable-rule="match_addos" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
# REQUIRES: asserts
# (G_*ADDO x, 0) -> x + no carry
diff --git a/llvm/test/CodeGen/AArch64/overflow.ll b/llvm/test/CodeGen/AArch64/overflow.ll
index 444aaeb0f3fe75..1fd60c03097906 100644
--- a/llvm/test/CodeGen/AArch64/overflow.ll
+++ b/llvm/test/CodeGen/AArch64/overflow.ll
@@ -19,20 +19,12 @@ entry:
}
define zeroext i1 @saddo1.i32.fold(i32 %v1, i32 %v2, ptr %res) {
-; SDAG-LABEL: saddo1.i32.fold:
-; SDAG: // %bb.0: // %entry
-; SDAG-NEXT: mov w8, #20 // =0x14
-; SDAG-NEXT: mov w0, wzr
-; SDAG-NEXT: str w8, [x2]
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: saddo1.i32.fold:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: mov w8, #9 // =0x9
-; GISEL-NEXT: adds w8, w8, #11
-; GISEL-NEXT: cset w0, vs
-; GISEL-NEXT: str w8, [x2]
-; GISEL-NEXT: ret
+; CHECK-LABEL: saddo1.i32.fold:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #20 // =0x14
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: str w8, [x2]
+; CHECK-NEXT: ret
entry:
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 9, i32 11)
%val = extractvalue {i32, i1} %t, 0
@@ -123,18 +115,11 @@ entry:
}
define zeroext i1 @saddo.canon.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; SDAG-LABEL: saddo.canon.i32:
-; SDAG: // %bb.0: // %entry
-; SDAG-NEXT: mov w0, wzr
-; SDAG-NEXT: str w4, [x5]
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: saddo.canon.i32:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: adds w8, wzr, w4
-; GISEL-NEXT: cset w0, vs
-; GISEL-NEXT: str w8, [x5]
-; GISEL-NEXT: ret
+; CHECK-LABEL: saddo.canon.i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: str w4, [x5]
+; CHECK-NEXT: ret
entry:
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 0, i32 %v5)
%val = extractvalue {i32, i1} %t, 0
@@ -143,13 +128,19 @@ entry:
ret i1 %obit
}
define zeroext i1 @saddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; CHECK-LABEL: saddo.add.i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add w8, w4, #100
-; CHECK-NEXT: subs w8, w8, #100
-; CHECK-NEXT: cset w0, vs
-; CHECK-NEXT: str w8, [x5]
-; CHECK-NEXT: ret
+; SDAG-LABEL: saddo.add.i32:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add w8, w4, #100
+; SDAG-NEXT: subs w8, w8, #100
+; SDAG-NEXT: cset w0, vs
+; SDAG-NEXT: str w8, [x5]
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: saddo.add.i32:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: mov w0, wzr
+; GISEL-NEXT: str w4, [x5]
+; GISEL-NEXT: ret
entry:
%lhs = add nsw i32 %v5, 100
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %lhs, i32 -100)
@@ -160,13 +151,20 @@ entry:
}
define zeroext i1 @uaddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) {
-; CHECK-LABEL: uaddo.add.i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add w8, w4, #5
-; CHECK-NEXT: adds w8, w8, #5
-; CHECK-NEXT: cset w0, hs
-; CHECK-NEXT: str w8, [x5]
-; CHECK-NEXT: ret
+; SDAG-LABEL: uaddo.add.i32:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add w8, w4, #5
+; SDAG-NEXT: adds w8, w8, #5
+; SDAG-NEXT: cset w0, hs
+; SDAG-NEXT: str w8, [x5]
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: uaddo.add.i32:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: adds w8, w4, #10
+; GISEL-NEXT: cset w0, hs
+; GISEL-NEXT: str w8, [x5]
+; GISEL-NEXT: ret
entry:
%lhs = add nuw i32 %v5, 5
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %lhs, i32 5)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index d36f5c0ea89d98..a6f9bb7ee055d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4142,11 +4142,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4162,7 +4162,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4179,7 +4179,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX6-NEXT: s_ashr_i32 s2, s7, 31
; GFX6-NEXT: s_ashr_i32 s5, s7, 15
-; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000
+; GFX6-NEXT: s_addk_i32 s2, 0x8000
; GFX6-NEXT: v_mov_b32_e32 v0, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: v_mov_b32_e32 v2, s4
@@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX8-NEXT: s_ashr_i32 s2, s7, 31
; GFX8-NEXT: s_ashr_i32 s5, s7, 15
-; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000
+; GFX8-NEXT: s_addk_i32 s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s4
@@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX10-NEXT: s_xor_b32 s0, s1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX11-NEXT: s_xor_b32 s0, s1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX9-LABEL: saddsat_i48_vs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4529,11 +4529,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4546,7 +4546,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4560,7 +4560,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: s_ashr_i32 s2, s5, 31
-; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: v_mov_b32_e32 v2, s4
@@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX8-NEXT: s_ashr_i32 s2, s5, 31
-; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
@@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX10-NEXT: s_xor_b32 s0, s1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX11-NEXT: s_xor_b32 s0, s1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: ; return to shader part epilog
@@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: ; return to shader part epilog
@@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -4866,21 +4866,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4896,10 +4895,10 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5]
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11
-; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4
+; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7]
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
@@ -4921,8 +4920,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
; GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0, v[6:7]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1
@@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
; GFX6-NEXT: s_ashr_i32 s4, s9, 31
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v2, s8
@@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; GFX6-NEXT: s_ashr_i32 s4, s1, 31
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v4, s0
@@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
; GFX8-NEXT: s_ashr_i32 s4, s9, 31
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s8
@@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; GFX8-NEXT: s_ashr_i32 s4, s1, 31
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
@@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s4, s9, 31
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s8
@@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; GFX9-NEXT: s_ashr_i32 s4, s1, 31
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v4, s0
@@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0
; GFX10-NEXT: s_ashr_i32 s4, s9, 31
; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX10-NEXT: s_xor_b32 s8, s1, s0
; GFX10-NEXT: s_add_u32 s0, s2, s6
; GFX10-NEXT: s_addc_u32 s1, s3, s7
@@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
; GFX10-NEXT: s_ashr_i32 s4, s1, 31
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX10-NEXT: s_xor_b32 s1, s3, s2
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0
; GFX11-NEXT: s_ashr_i32 s4, s9, 31
-; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s8, s1, s0
; GFX11-NEXT: s_add_u32 s0, s2, s6
; GFX11-NEXT: s_addc_u32 s1, s3, s7
@@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
; GFX11-NEXT: s_ashr_i32 s4, s1, 31
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s1, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5132,7 +5131,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s0, s9, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
@@ -5179,7 +5178,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s9, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
@@ -5226,7 +5225,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s9, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -5269,7 +5268,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
; GFX10-NEXT: v_mov_b32_e32 v2, s5
; GFX10-NEXT: s_ashr_i32 s0, s9, 31
-; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_mov_b32_e32 v1, s4
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5310,7 +5309,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
; GFX11-NEXT: v_mov_b32_e32 v2, s5
; GFX11-NEXT: s_ashr_i32 s0, s9, 31
-; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5412,9 +5411,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc
; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6
-; GFX9-NEXT: v_bfrev_b32_e32 v6, 1
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v3, v6
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_add_u32_e32 v6, 0x80000000, v3
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -5440,7 +5438,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6
-; GFX10-NEXT: v_add_co_u32 v6, s0, 0x80000000, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5467,7 +5465,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6
-; GFX11-NEXT: v_add_co_u32 v6, null, 0x80000000, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -5569,9 +5567,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5597,9 +5594,9 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
-; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5627,15 +5624,14 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
%cast = bitcast i128 %result to <4 x float>
@@ -5762,12 +5758,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v17
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
@@ -5786,11 +5781,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15]
+; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6
; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
@@ -5832,18 +5827,18 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v17
-; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v3, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo
+; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v6, s4
@@ -5882,18 +5877,17 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3
; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5
+; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1
+; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0
@@ -5927,7 +5921,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s0, s17, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: v_mov_b32_e32 v3, s9
@@ -5960,7 +5954,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
@@ -6011,7 +6005,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s17, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s9
@@ -6050,7 +6044,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s4, s3, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
@@ -6101,7 +6095,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s17, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, s9
@@ -6140,7 +6134,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -6184,7 +6178,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_and_b32 s1, 1, s1
; GFX10-NEXT: s_ashr_i32 s10, s17, 31
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT: s_add_u32 s11, s10, 0x80000000
+; GFX10-NEXT: s_add_i32 s11, s10, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
; GFX10-NEXT: s_add_u32 s0, s4, s12
@@ -6221,7 +6215,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v2, s17
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo
-; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX10-NEXT: v_readfirstlane_b32 s1, v4
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
@@ -6261,7 +6255,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX11-NEXT: s_and_b32 s1, 1, s1
; GFX11-NEXT: s_ashr_i32 s10, s17, 31
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT: s_add_u32 s11, s10, 0x80000000
+; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
; GFX11-NEXT: s_add_u32 s0, s4, s12
@@ -6299,7 +6293,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
-; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: v_readfirstlane_b32 s1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0a6b7af2f78d4c..84906c01a4698a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -3091,253 +3091,252 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0, v1
-; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v3
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc
-; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5
+; GISEL-NEXT: v_trunc_f32_e32 v9, v7
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4
-; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4
-; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6]
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8]
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v11, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v0, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v11, v7
+; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 0, v2
-; GISEL-NEXT: v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v2
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
-; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v13
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v7
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
-; GISEL-NEXT: v_trunc_f32_e32 v6, v6
-; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v2
+; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v2
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5]
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v6, v1
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v4
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v2
; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7]
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v9
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7]
-; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3
-; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5
-; GISEL-NEXT: v_mul_lo_u32 v10, v13, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v11, 0
+; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, v[5:6]
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v10
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v11, v[5:6]
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v13, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v1, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v14, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v13, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v1
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v14, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v17
-; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2]
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v13, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v18, v14, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_mul_lo_u32 v7, v11, v5
-; GISEL-NEXT: v_mul_lo_u32 v8, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12
-; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v6, v13, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v13, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v8, 0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v11, v[1:2]
+; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v8, v[5:6]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v16, v13, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v8, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], 0, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT: v_mul_hi_u32 v8, v10, v6
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v8, v5
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v12, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5
-; GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4
+; GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], v11, v5, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v10, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v3, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1]
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v0
+; GISEL-NEXT: v_mul_hi_u32 v9, v3, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[0:1]
; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v9, v5
-; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2
-; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2
-; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v9, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v8
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v7
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v6, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8
+; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index c455b24313ddc2..83ebc84e1f84a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3034,253 +3034,251 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
+; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1
-; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5
+; GISEL-NEXT: v_trunc_f32_e32 v9, v7
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4
-; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4
-; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1]
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0
-; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v5, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v10
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v6, v9, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v5, v3, vcc
+; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v7, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v1
+; GISEL-NEXT: v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc
; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v0
-; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v13, vcc
-; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v2
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v4, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v18, v[0:1]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v6, v4
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0
+; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v2
+; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v6
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v15, v[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, v19, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v0, v18, v5
-; GISEL-NEXT: v_mul_lo_u32 v19, v15, v6
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v13, v3, vcc
-; GISEL-NEXT: v_mul_hi_u32 v13, v15, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v0, v17, v4
+; GISEL-NEXT: v_mul_lo_u32 v18, v14, v5
+; GISEL-NEXT: v_mul_hi_u32 v19, v14, v4
+; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v18, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v18, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0
-; GISEL-NEXT: v_mul_hi_u32 v19, v15, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v19
+; GISEL-NEXT: v_mul_lo_u32 v19, v17, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v18, v14, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v19, v4
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT: v_mul_hi_u32 v6, v18, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v13, 0
-; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v10, v1
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v15, v[0:1]
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v18, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v15, v5
-; GISEL-NEXT: v_mul_lo_u32 v7, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v11, v13, v5
-; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], 0, v12
-; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v0
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT: v_mul_hi_u32 v5, v17, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v18, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v0
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v11, v1
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1]
+; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v18, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v17, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v4
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v6
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5
-; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v15, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT: v_mul_lo_u32 v7, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v10, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v5
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v14, v4
+; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v9, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v3, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v0
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v7
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, v[0:1]
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v5
+; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v0, v6
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1]
; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v9, v[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v8, v5
-; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v6
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v4
-; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v4, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v3, v2
-; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
+; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4
-; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v6, v4, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 61e1e67b7ae360..320dfbb4980e4c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4142,11 +4142,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4162,7 +4162,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4179,7 +4179,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX6-NEXT: s_ashr_i32 s2, s7, 31
; GFX6-NEXT: s_ashr_i32 s5, s7, 15
-; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000
+; GFX6-NEXT: s_addk_i32 s2, 0x8000
; GFX6-NEXT: v_mov_b32_e32 v0, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: v_mov_b32_e32 v2, s4
@@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX8-NEXT: s_ashr_i32 s2, s7, 31
; GFX8-NEXT: s_ashr_i32 s5, s7, 15
-; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000
+; GFX8-NEXT: s_addk_i32 s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s4
@@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX10-NEXT: s_xor_b32 s0, s1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX11-NEXT: s_xor_b32 s0, s1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX9-LABEL: ssubsat_i48_vs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4529,11 +4529,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4546,7 +4546,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4560,7 +4560,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: s_ashr_i32 s2, s5, 31
-; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: v_mov_b32_e32 v2, s4
@@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX8-NEXT: s_ashr_i32 s2, s5, 31
-; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
@@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX10-NEXT: s_xor_b32 s0, s1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX11-NEXT: s_xor_b32 s0, s1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: ; return to shader part epilog
@@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: ; return to shader part epilog
@@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -4866,21 +4866,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4896,10 +4895,10 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5]
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11
-; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4
+; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7]
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
@@ -4921,8 +4920,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
; GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0, v[6:7]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1
@@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
; GFX6-NEXT: s_ashr_i32 s4, s9, 31
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v2, s8
@@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
; GFX6-NEXT: s_ashr_i32 s4, s1, 31
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v4, s0
@@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
; GFX8-NEXT: s_ashr_i32 s4, s9, 31
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s8
@@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
; GFX8-NEXT: s_ashr_i32 s4, s1, 31
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
@@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s4, s9, 31
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s8
@@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
; GFX9-NEXT: s_ashr_i32 s4, s1, 31
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v4, s0
@@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0
; GFX10-NEXT: s_ashr_i32 s4, s9, 31
; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX10-NEXT: s_xor_b32 s8, s1, s0
; GFX10-NEXT: s_sub_u32 s0, s2, s6
; GFX10-NEXT: s_subb_u32 s1, s3, s7
@@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
; GFX10-NEXT: s_ashr_i32 s4, s1, 31
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX10-NEXT: s_xor_b32 s1, s3, s2
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0
; GFX11-NEXT: s_ashr_i32 s4, s9, 31
-; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s8, s1, s0
; GFX11-NEXT: s_sub_u32 s0, s2, s6
; GFX11-NEXT: s_subb_u32 s1, s3, s7
@@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
; GFX11-NEXT: s_ashr_i32 s4, s1, 31
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s1, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5134,7 +5133,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s0, s11, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: v_mov_b32_e32 v3, s9
@@ -5183,7 +5182,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s11, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s9
@@ -5232,7 +5231,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s11, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, s9
@@ -5274,7 +5273,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v2, s9
; GFX10-NEXT: v_mov_b32_e32 v3, s11
@@ -5317,7 +5316,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9
; GFX11-NEXT: v_mov_b32_e32 v3, s11
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
@@ -5427,9 +5426,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5456,7 +5454,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5484,8 +5482,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5594,9 +5591,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5625,7 +5621,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5652,12 +5648,12 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
; GFX11-NEXT: s_and_b32 s0, 1, s4
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5805,9 +5801,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc
@@ -5831,8 +5826,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6
; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
@@ -5877,18 +5872,18 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v19
-; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v3, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
+; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v19
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4
@@ -5931,18 +5926,16 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v19
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6
+; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
+; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3
-; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v3 :: v_dual_and_b32 v5, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1
+; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4
; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0
@@ -5978,7 +5971,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s0, s19, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s16
; GFX6-NEXT: v_mov_b32_e32 v3, s17
@@ -6013,7 +6006,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
@@ -6066,7 +6059,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s19, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NEXT: v_mov_b32_e32 v3, s17
@@ -6107,7 +6100,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s4, s3, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
@@ -6160,7 +6153,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s19, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s16
; GFX9-NEXT: v_mov_b32_e32 v3, s17
@@ -6201,7 +6194,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -6244,7 +6237,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: s_ashr_i32 s8, s17, 31
; GFX10-NEXT: s_and_b32 s1, 1, s1
-; GFX10-NEXT: s_add_u32 s9, s8, 0x80000000
+; GFX10-NEXT: s_add_i32 s9, s8, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
@@ -6273,7 +6266,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_ashr_i32 s4, s3, 31
; GFX10-NEXT: s_and_b32 s5, 1, s5
-; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
@@ -6326,7 +6319,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX11-NEXT: s_cselect_b32 s1, 1, 0
; GFX11-NEXT: s_ashr_i32 s8, s19, 31
; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_add_u32 s9, s8, 0x80000000
+; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
@@ -6357,7 +6350,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
-; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s18
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 887c43f5fce59e..d15551365707b1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -2062,13 +2062,9 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
@@ -2077,10 +2073,6 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 5c6bb6dea16466..07480a0ce0c2e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -2480,13 +2480,9 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
@@ -2495,10 +2491,6 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
>From cea3d3dd09d58cf002882a5ddb5ed345151f42bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Sun, 25 Feb 2024 20:18:37 +0100
Subject: [PATCH 2/5] address review comments
---
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 779af42ae70807..ee0da852c7315b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7054,9 +7054,13 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
}
};
+ // We try to combine addo to non-overflowing add.
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) ||
+ !isConstantLegalOrBeforeLegalizer(CarryTy))
+ return false;
+
// We try to combine uaddo to non-overflowing add.
- if (!IsSigned && isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) &&
- isConstantLegalOrBeforeLegalizer(DstTy)) {
+ if (!IsSigned) {
ConstantRange CRLHS =
ConstantRange::fromKnownBits(KB->getKnownBits(LHS), false /*IsSigned*/);
ConstantRange CRRHS =
@@ -7080,14 +7084,11 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
};
return true;
}
- };
+ }
return false;
- };
+ }
// We try to combine saddo to non-overflowing add.
- if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}}) ||
- !isConstantLegalOrBeforeLegalizer(CarryTy))
- return false;
// If LHS and RHS each have at least two sign bits, then there is no signed
// overflow.
@@ -7122,7 +7123,7 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
};
return true;
}
- };
+ }
return false;
}
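
Side note for readers of the unsigned path touched above: the range check boils down to the ConstantRange idiom sketched here. This is illustrative only and not part of the patch; the helper name and the direct use of KnownBits are assumptions, while the combine itself feeds GISelKnownBits results into the same ConstantRange API.

    #include "llvm/IR/ConstantRange.h"
    #include "llvm/Support/KnownBits.h"

    using namespace llvm;

    // Turn the known bits of both addends into unsigned ranges and ask whether
    // the unsigned add can ever carry. If it never can, G_UADDO may be
    // rewritten as a plain G_ADD with a constant-false carry-out.
    static bool uaddNeverOverflows(const KnownBits &LHSKnown,
                                   const KnownBits &RHSKnown) {
      ConstantRange CRLHS =
          ConstantRange::fromKnownBits(LHSKnown, /*IsSigned=*/false);
      ConstantRange CRRHS =
          ConstantRange::fromKnownBits(RHSKnown, /*IsSigned=*/false);
      return CRLHS.unsignedAddMayOverflow(CRRHS) ==
             ConstantRange::OverflowResult::NeverOverflows;
    }

The AlwaysOverflows results are handled analogously in the patch by materializing a constant-true carry.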
>From 5addd195d7b727f9cc96b79b3b68671be96a271d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Mon, 26 Feb 2024 15:34:14 +0100
Subject: [PATCH 3/5] address review comments
---
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 10 +++++-----
.../CodeGen/AArch64/GlobalISel/combine-overflow.mir | 2 +-
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index ee0da852c7315b..348820ae186971 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7062,9 +7062,9 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
// We try to combine uaddo to non-overflowing add.
if (!IsSigned) {
ConstantRange CRLHS =
- ConstantRange::fromKnownBits(KB->getKnownBits(LHS), false /*IsSigned*/);
+ ConstantRange::fromKnownBits(KB->getKnownBits(LHS), /*IsSigned=*/false);
ConstantRange CRRHS =
- ConstantRange::fromKnownBits(KB->getKnownBits(RHS), false /*IsSigned*/);
+ ConstantRange::fromKnownBits(KB->getKnownBits(RHS), /*IsSigned=*/false);
switch (CRLHS.unsignedAddMayOverflow(CRRHS)) {
case ConstantRange::OverflowResult::MayOverflow:
@@ -7092,7 +7092,7 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
// If LHS and RHS each have at least two sign bits, then there is no signed
// overflow.
- if (KB->computeNumSignBits(LHS) > 1 && KB->computeNumSignBits(RHS) > 1) {
+ if (KB->computeNumSignBits(RHS) > 1 && KB->computeNumSignBits(LHS) > 1) {
MatchInfo = [=](MachineIRBuilder &B) {
B.buildAdd(Dst, LHS, RHS, MachineInstr::MIFlag::NoSWrap);
B.buildConstant(Carry, 0);
@@ -7101,9 +7101,9 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
}
ConstantRange CRLHS =
- ConstantRange::fromKnownBits(KB->getKnownBits(LHS), true /*IsSigned*/);
+ ConstantRange::fromKnownBits(KB->getKnownBits(LHS), /*IsSigned=*/true);
ConstantRange CRRHS =
- ConstantRange::fromKnownBits(KB->getKnownBits(RHS), true /*IsSigned*/);
+ ConstantRange::fromKnownBits(KB->getKnownBits(RHS), /*IsSigned=*/true);
switch (CRLHS.signedAddMayOverflow(CRRHS)) {
case ConstantRange::OverflowResult::MayOverflow:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
index 2967230cea6174..6fced31a622d9d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
+# RUN: llc -run-pass=aarch64-prelegalizer-combiner -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s
---
name: add_unused
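
For completeness, the sign-bit rule referenced in the second hunk ("at least two sign bits on each side") can be stated as a standalone predicate. The sketch below uses assumed names and plain APInt values; the actual code queries KB->computeNumSignBits on the virtual registers.

    #include "llvm/ADT/APInt.h"

    using namespace llvm;

    // If each N-bit operand has two or more sign bits, its value fits in a
    // signed (N-1)-bit range, so the N-bit signed sum cannot wrap and G_SADDO
    // can become G_ADD with the nsw flag and a zero carry-out.
    static bool saddCannotOverflow(const APInt &LHS, const APInt &RHS) {
      return LHS.getNumSignBits() > 1 && RHS.getNumSignBits() > 1;
    }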
>From aed590ce214d0b5d8886fd6cce91ac0ec9d046fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Tue, 27 Feb 2024 18:28:23 +0100
Subject: [PATCH 4/5] remove else after return
---
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 348820ae186971..85e921bf68fda5 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6968,12 +6968,12 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
B.buildSAddo(Dst, Carry, RHS, LHS);
};
return true;
- } else {
- MatchInfo = [=](MachineIRBuilder &B) {
- B.buildUAddo(Dst, Carry, RHS, LHS);
- };
- return true;
}
+ // !IsSigned
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildUAddo(Dst, Carry, RHS, LHS);
+ };
+ return true;
}
std::optional<APInt> MaybeLHS = getConstantOrConstantSplatVector(LHS);
>From e6683631d8fb0661730bcdb45794b4b0b0df2acd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Tue, 12 Mar 2024 08:34:29 +0100
Subject: [PATCH 5/5] remove bitwidth
---
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 42 +++++--------------
1 file changed, 10 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 85e921bf68fda5..bee49dbd0f8380 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6982,21 +6982,9 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
// Fold addo(c1, c2) -> c3, carry.
if (MaybeLHS && MaybeRHS && isConstantLegalOrBeforeLegalizer(DstTy) &&
isConstantLegalOrBeforeLegalizer(CarryTy)) {
- // They must both have the same bitwidth. Otherwise APInt might
- // assert. Pre legalization, they may have widely different bitwidths.
- unsigned BitWidth =
- std::max(MaybeLHS->getBitWidth(), MaybeRHS->getBitWidth());
bool Overflow;
- APInt Result;
- if (IsSigned) {
- APInt LHS = MaybeLHS->sext(BitWidth);
- APInt RHS = MaybeRHS->sext(BitWidth);
- Result = LHS.sadd_ov(RHS, Overflow);
- } else {
- APInt LHS = MaybeLHS->zext(BitWidth);
- APInt RHS = MaybeRHS->zext(BitWidth);
- Result = LHS.uadd_ov(RHS, Overflow);
- }
+ APInt Result = IsSigned ? MaybeLHS->sadd_ov(*MaybeRHS, Overflow)
+ : MaybeLHS->uadd_ov(*MaybeRHS, Overflow);
MatchInfo = [=](MachineIRBuilder &B) {
B.buildConstant(Dst, Result);
B.buildConstant(Carry, Overflow);
@@ -7023,19 +7011,9 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
std::optional<APInt> MaybeAddRHS =
getConstantOrConstantSplatVector(AddLHS->getRHSReg());
if (MaybeAddRHS) {
- unsigned BitWidth =
- std::max(MaybeRHS->getBitWidth(), MaybeAddRHS->getBitWidth());
bool Overflow;
- APInt NewC;
- if (IsSigned) {
- APInt LHS = MaybeRHS->sext(BitWidth);
- APInt RHS = MaybeAddRHS->sext(BitWidth);
- NewC = LHS.sadd_ov(RHS, Overflow);
- } else {
- APInt LHS = MaybeRHS->zext(BitWidth);
- APInt RHS = MaybeAddRHS->zext(BitWidth);
- NewC = LHS.uadd_ov(RHS, Overflow);
- }
+ APInt NewC = IsSigned ? MaybeAddRHS->sadd_ov(*MaybeRHS, Overflow)
+ : MaybeAddRHS->uadd_ov(*MaybeRHS, Overflow);
if (!Overflow && isConstantLegalOrBeforeLegalizer(DstTy)) {
if (IsSigned) {
MatchInfo = [=](MachineIRBuilder &B) {
@@ -7043,13 +7021,13 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
B.buildSAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
};
return true;
- } else {
- MatchInfo = [=](MachineIRBuilder &B) {
- auto ConstRHS = B.buildConstant(DstTy, NewC);
- B.buildUAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
- };
- return true;
}
+ // !IsSigned
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto ConstRHS = B.buildConstant(DstTy, NewC);
+ B.buildUAddo(Dst, Carry, AddLHS->getLHSReg(), ConstRHS);
+ };
+ return true;
}
}
};
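
A minimal standalone sketch of the constant fold that patch 5 simplifies, with assumed names: it presumes both splat constants already carry the destination type's bitwidth, which is exactly what makes the removed sext/zext normalization unnecessary.

    #include "llvm/ADT/APInt.h"
    #include <utility>

    using namespace llvm;

    // Fold addo(c1, c2) into a constant result plus a constant carry. APInt's
    // overflow-reporting adders require both operands to have equal bitwidths.
    static std::pair<APInt, bool> foldConstantAddo(const APInt &C1,
                                                   const APInt &C2,
                                                   bool IsSigned) {
      bool Overflow;
      APInt Result = IsSigned ? C1.sadd_ov(C2, Overflow)
                              : C1.uadd_ov(C2, Overflow);
      return {Result, Overflow};
    }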