[llvm] [GlobalIsel] Combine zext of trunc (episode II) (PR #108305)

Thorsten Schütt via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 11 22:26:27 PDT 2024


https://github.com/tschuett updated https://github.com/llvm/llvm-project/pull/108305

From 774a6ae62d67b8adbd74fe832fa1f552e9ad8ecc Mon Sep 17 00:00:00 2001
From: Thorsten Schütt <schuett at gmail.com>
Date: Wed, 11 Sep 2024 17:33:46 +0200
Subject: [PATCH] [GlobalIsel] Combine zext of trunc (episode II)

The One with the Sonogram at the End

Either replace zext(trunc(x)) with x when the truncated bits are known to
be zero, or, since zero extension only appends zero bits, rewrite the cast
pair as a logical 'and' with a low-bits mask:

SrcSize <  DstSize: zext(x & mask)
SrcSize == DstSize: x & mask
SrcSize  > DstSize: trunc(x) & mask

Credits: https://reviews.llvm.org/D96031
         InstCombinerImpl::visitZExt
         LegalizationArtifactCombiner::tryCombineZExt

Test: AMDGPU/GlobalISel/combine-zext-trunc.mir
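
The rewrite relies on a plain bit-level identity. A minimal standalone C++
sketch (not part of the patch; MidSize and the sample value are made up,
only to illustrate the SrcSize == DstSize case):

  #include <cassert>
  #include <cstdint>

  int main() {
    // zext(trunc(x)) keeps the low MidSize bits and zeroes the rest,
    // which is exactly x & ((1 << MidSize) - 1) at equal bit widths.
    const unsigned MidSize = 8;
    uint64_t X = 0xDEADBEEFCAFEBABEull;
    uint64_t ViaCasts = static_cast<uint8_t>(X);   // trunc to i8, zext back to i64
    uint64_t Mask = (uint64_t(1) << MidSize) - 1;  // 0xff
    assert(ViaCasts == (X & Mask));                // x & mask
    return 0;
  }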
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |    7 +-
 .../include/llvm/Target/GlobalISel/Combine.td |   23 +-
 llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp |    4 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |   14 -
 .../GlobalISel/CombinerHelperCasts.cpp        |   91 ++
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       |    2 +-
 .../AArch64/GlobalISel/arm64-atomic.ll        |   18 +-
 .../AArch64/GlobalISel/arm64-pcsections.ll    |   34 +-
 .../GlobalISel/combine-extract-vec-elt.mir    |    4 +-
 .../AArch64/GlobalISel/combine-select.mir     |   16 +-
 .../AArch64/GlobalISel/combine-with-flags.mir |   33 +-
 .../AArch64/GlobalISel/inline-memset.mir      |   47 +-
 .../prelegalizercombiner-extending-loads.mir  |    6 +-
 llvm/test/CodeGen/AArch64/aarch64-mops.ll     |    6 +-
 llvm/test/CodeGen/AArch64/aarch64-smull.ll    |    8 +-
 llvm/test/CodeGen/AArch64/addsub_ext.ll       |   48 +-
 .../CodeGen/AArch64/arm64-subvector-extend.ll |   11 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    |  176 ++-
 llvm/test/CodeGen/AArch64/zext.ll             |  197 ++--
 llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll   |   39 +
 .../CodeGen/AMDGPU/GlobalISel/addsubu64.ll    |    8 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |   16 +-
 .../br-constant-invalid-sgpr-copy.ll          |    4 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll  |    4 +-
 ...mbine-shl-from-extend-narrow.postlegal.mir |    8 +-
 ...ombine-shl-from-extend-narrow.prelegal.mir |    8 +-
 .../AMDGPU/GlobalISel/combine-zext-trunc.mir  |  102 +-
 ...-divergent-i1-phis-no-lane-mask-merging.ll |   14 +-
 .../GlobalISel/divergent-control-flow.ll      |    6 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   |  792 +++++++------
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   |  377 +++---
 ...licit-kernarg-backend-usage-global-isel.ll |   48 +-
 .../AMDGPU/GlobalISel/is-safe-to-sink-bug.ll  |   17 +-
 .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll     |   43 +-
 .../GlobalISel/llvm.amdgcn.ballot.i32.ll      |   39 +
 .../GlobalISel/llvm.amdgcn.ballot.i64.ll      |   36 +
 .../GlobalISel/llvm.amdgcn.end.cf.i32.ll      |    6 +
 .../GlobalISel/llvm.amdgcn.end.cf.i64.ll      |    3 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |   73 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    |  579 ++++++----
 .../AMDGPU/GlobalISel/non-entry-alloca.ll     |    9 +
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll |  411 +++++--
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll     |   87 +-
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 1010 +++++++++--------
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |   21 +-
 .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll     |  141 ++-
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll |  433 +++++--
 .../AMDGPU/GlobalISel/store-local.128.ll      |   85 +-
 .../AMDGPU/GlobalISel/store-local.96.ll       |   83 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll   |   39 +
 .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll |  236 ++++
 .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll     |    5 +-
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  295 ++---
 .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll     |   17 +-
 .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll |  236 ++++
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   |    9 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll   |   11 +-
 llvm/test/CodeGen/AMDGPU/bfi_int.ll           |   24 +
 llvm/test/CodeGen/AMDGPU/constrained-shift.ll |   12 +
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           |   87 +-
 .../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll   |   46 +-
 .../AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll  |   40 +-
 .../AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll  |    9 +
 .../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll  |   13 +
 .../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll   |   13 +
 .../llvm.amdgcn.pops.exiting.wave.id.ll       |   53 +-
 .../AMDGPU/pseudo-scalar-transcendental.ll    |  183 ++-
 .../CodeGen/AMDGPU/scratch-pointer-sink.ll    |    4 +
 68 files changed, 4338 insertions(+), 2241 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 828532dcffb7d3..bf32dcf5f2c85a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -387,9 +387,6 @@ class CombinerHelper {
   /// Transform anyext(trunc(x)) to x.
   bool matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg);
 
-  /// Transform zext(trunc(x)) to x.
-  bool matchCombineZextTrunc(MachineInstr &MI, Register &Reg);
-
   /// Transform trunc (shl x, K) to shl (trunc x), K
   ///    if K < VT.getScalarSizeInBits().
   ///
@@ -909,6 +906,10 @@ class CombinerHelper {
   bool matchCastOfBuildVector(const MachineInstr &CastMI,
                               const MachineInstr &BVMI, BuildFnTy &MatchInfo);
 
+  /// Transform zext of truncate to x or and(x, mask).
+  bool matchCombineZextTrunc(const MachineInstr &ZextMI,
+                             const MachineInstr &TruncMI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index a595a51d7b01ff..587dbe20e94c35 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -758,15 +758,6 @@ def anyext_trunc_fold: GICombineRule <
   (apply [{ Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])
 >;
 
-// Fold (zext (trunc x)) -> x if the source type is same as the destination type
-// and truncated bits are known to be zero.
-def zext_trunc_fold: GICombineRule <
-  (defs root:$root, register_matchinfo:$matchinfo),
-  (match (wip_match_opcode G_ZEXT):$root,
-         [{ return Helper.matchCombineZextTrunc(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])
->;
-
 def not_cmp_fold_matchinfo : GIDefMatchData<"SmallVector<Register, 4>">;
 def not_cmp_fold : GICombineRule<
   (defs root:$d, not_cmp_fold_matchinfo:$info),
@@ -1894,6 +1885,15 @@ class integer_of_opcode<Instruction castOpcode> : GICombineRule <
 
 def integer_of_truncate : integer_of_opcode<G_TRUNC>;
 
+/// Transform zext of truncate to x or and(x, mask).
+def zext_of_truncate : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (G_TRUNC $trunc, $src):$TruncMI,
+         (G_ZEXT $root, $trunc):$ZextMI,
+         [{ return Helper.matchCombineZextTrunc(*${ZextMI}, *${TruncMI}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${ZextMI}, ${matchinfo}); }])>;
+
+
 def cast_combines: GICombineGroup<[
   truncate_of_zext,
   truncate_of_sext,
@@ -1915,7 +1915,8 @@ def cast_combines: GICombineGroup<[
   narrow_binop_and,
   narrow_binop_or,
   narrow_binop_xor,
-  integer_of_truncate
+  integer_of_truncate,
+  zext_of_truncate
 ]>;
 
 
@@ -1951,7 +1952,7 @@ def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,
 
 def known_bits_simplifications : GICombineGroup<[
   redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask,
-  zext_trunc_fold, icmp_to_true_false_known_bits, icmp_to_lhs_known_bits,
+  icmp_to_true_false_known_bits, icmp_to_lhs_known_bits,
   sext_inreg_to_zext_inreg]>;
 
 def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend,
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index 547529bbe699ab..5addf93599085a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -333,8 +333,10 @@ MachineInstrBuilder CSEMIRBuilder::buildConstant(const DstOp &Res,
 
   // For vectors, CSE the element only for now.
   LLT Ty = Res.getLLTTy(*getMRI());
-  if (Ty.isVector())
+  if (Ty.isFixedVector())
     return buildSplatBuildVector(Res, buildConstant(Ty.getElementType(), Val));
+  if (Ty.isScalableVector())
+    return buildSplatVector(Res, buildConstant(Ty.getElementType(), Val));
 
   FoldingSetNodeID ID;
   GISelInstProfileBuilder ProfBuilder(ID, *getMRI());
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index df9c12bc9c97bd..14d4e413456403 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2524,20 +2524,6 @@ bool CombinerHelper::matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg) {
                   m_GTrunc(m_all_of(m_Reg(Reg), m_SpecificType(DstTy))));
 }
 
-bool CombinerHelper::matchCombineZextTrunc(MachineInstr &MI, Register &Reg) {
-  assert(MI.getOpcode() == TargetOpcode::G_ZEXT && "Expected a G_ZEXT");
-  Register DstReg = MI.getOperand(0).getReg();
-  Register SrcReg = MI.getOperand(1).getReg();
-  LLT DstTy = MRI.getType(DstReg);
-  if (mi_match(SrcReg, MRI,
-               m_GTrunc(m_all_of(m_Reg(Reg), m_SpecificType(DstTy))))) {
-    unsigned DstSize = DstTy.getScalarSizeInBits();
-    unsigned SrcSize = MRI.getType(SrcReg).getScalarSizeInBits();
-    return KB->getKnownBits(Reg).countMinLeadingZeros() >= DstSize - SrcSize;
-  }
-  return false;
-}
-
 static LLT getMidVTForTruncRightShiftCombine(LLT ShiftTy, LLT TruncTy) {
   const unsigned ShiftSize = ShiftTy.getScalarSizeInBits();
   const unsigned TruncSize = TruncTy.getScalarSizeInBits();
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
index 30557e6a2304e6..2171f2f6feb7eb 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -359,3 +359,94 @@ bool CombinerHelper::matchCastOfInteger(const MachineInstr &CastMI,
     return false;
   }
 }
+
+bool CombinerHelper::matchCombineZextTrunc(const MachineInstr &ZextMI,
+                                           const MachineInstr &TruncMI,
+                                           BuildFnTy &MatchInfo) {
+  const GZext *Zext = cast<GZext>(&ZextMI);
+  const GTrunc *Trunc = cast<GTrunc>(&TruncMI);
+
+  Register Dst = Zext->getReg(0);
+  Register Mid = Zext->getSrcReg();
+  Register Src = Trunc->getSrcReg();
+
+  LLT DstTy = MRI.getType(Dst);
+  LLT SrcTy = MRI.getType(Src);
+
+  if (!MRI.hasOneNonDBGUse(Mid))
+    return false;
+
+  unsigned DstSize = DstTy.getScalarSizeInBits();
+  unsigned MidSize = MRI.getType(Mid).getScalarSizeInBits();
+  unsigned SrcSize = SrcTy.getScalarSizeInBits();
+
+  // Are the truncated bits known to be zero?
+  if (DstTy == SrcTy &&
+      (KB->getKnownBits(Src).countMinLeadingZeros() >= DstSize - MidSize)) {
+    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
+    return true;
+  }
+
+  // If the sizes are just right we can convert this into a logical
+  // 'and', which will be much cheaper than the pair of casts.
+
+  // If we're actually extending zero bits, then if
+  // SrcSize <  DstSize: zext(Src & mask)
+  // SrcSize == DstSize: Src & mask
+  // SrcSize  > DstSize: trunc(Src) & mask
+
+  if (DstSize == SrcSize) {
+    // Src & mask.
+
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_AND, {DstTy}}) ||
+        !isConstantLegalOrBeforeLegalizer(DstTy))
+      return false;
+
+    // build mask.
+    APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+
+    MatchInfo = [=](MachineIRBuilder &B) {
+      auto Mask = B.buildConstant(DstTy, AndValue);
+      B.buildAnd(Dst, Src, Mask);
+    };
+    return true;
+  }
+
+  if (SrcSize < DstSize) {
+    // zext(Src & mask).
+
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_AND, {SrcTy}}) ||
+        !isConstantLegalOrBeforeLegalizer(SrcTy) ||
+        !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}}))
+      return false;
+
+    APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+
+    MatchInfo = [=](MachineIRBuilder &B) {
+      auto Mask = B.buildConstant(SrcTy, AndValue);
+      auto And = B.buildAnd(SrcTy, Src, Mask);
+      B.buildZExt(Dst, And);
+    };
+    return true;
+  }
+
+  if (SrcSize > DstSize) {
+    // trunc(Src) & mask.
+
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_AND, {DstTy}}) ||
+        !isConstantLegalOrBeforeLegalizer(DstTy) ||
+        !isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}}))
+      return false;
+
+    APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
+
+    MatchInfo = [=](MachineIRBuilder &B) {
+      auto Mask = B.buildConstant(DstTy, AndValue);
+      auto Trunc = B.buildTrunc(DstTy, Src);
+      B.buildAnd(Dst, Trunc, Mask);
+    };
+    return true;
+  }
+
+  return false;
+}
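
For the two mixed-width shapes handled above, the same identity holds through
the extra trunc or zext. A hedged standalone C++ sketch (made-up widths and
values; not part of the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    // SrcSize (64) > DstSize (32), MidSize (8): trunc(Src) & mask.
    uint64_t Src = 0x123456789ABCDEF0ull;
    uint32_t ViaCasts = static_cast<uint8_t>(Src);        // trunc s64->s8, zext s8->s32
    uint32_t ViaAnd = static_cast<uint32_t>(Src) & 0xFFu; // trunc(Src) & 0xff
    assert(ViaCasts == ViaAnd);

    // SrcSize (16) < DstSize (64), MidSize (8): zext(Src & mask).
    uint16_t Src16 = 0xBEEF;
    uint64_t ViaCasts2 = static_cast<uint8_t>(Src16);     // trunc s16->s8, zext s8->s64
    uint64_t ViaAnd2 = uint64_t(Src16 & 0xFFu);           // zext(Src & 0xff)
    assert(ViaCasts2 == ViaAnd2);
    return 0;
  }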
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b2a3f9392157d1..25db0e678f49ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -168,6 +168,6 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
 def AMDGPURegBankCombiner : GICombiner<
   "AMDGPURegBankCombinerImpl",
   [unmerge_merge, unmerge_cst, unmerge_undef,
-   zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
+   int_minmax_to_med3, ptr_add_immed_chain,
    fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index de3f323891a36a..ddcc31d23b56d2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -1938,14 +1938,14 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) {
 define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) {
 ; CHECK-NOLSE-O1-LABEL: atomicrmw_xchg_i8:
 ; CHECK-NOLSE-O1:       ; %bb.0:
-; CHECK-NOLSE-O1-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NOLSE-O1-NEXT:    mov x8, x0
 ; CHECK-NOLSE-O1-NEXT:  LBB28_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NOLSE-O1-NEXT:    ldxrb w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    stxrb w9, w1, [x0]
+; CHECK-NOLSE-O1-NEXT:    ldxrb w0, [x8]
+; CHECK-NOLSE-O1-NEXT:    stxrb w9, w1, [x8]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB28_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
 ; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i8:
@@ -2993,14 +2993,14 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) {
 define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) {
 ; CHECK-NOLSE-O1-LABEL: atomicrmw_xchg_i16:
 ; CHECK-NOLSE-O1:       ; %bb.0:
-; CHECK-NOLSE-O1-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NOLSE-O1-NEXT:    mov x8, x0
 ; CHECK-NOLSE-O1-NEXT:  LBB38_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NOLSE-O1-NEXT:    ldxrh w8, [x0]
-; CHECK-NOLSE-O1-NEXT:    stxrh w9, w1, [x0]
+; CHECK-NOLSE-O1-NEXT:    ldxrh w0, [x8]
+; CHECK-NOLSE-O1-NEXT:    stxrh w9, w1, [x8]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w9, LBB38_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.2: ; %atomicrmw.end
-; CHECK-NOLSE-O1-NEXT:    mov w0, w8
+; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
 ; CHECK-OUTLINE-O1-LABEL: atomicrmw_xchg_i16:
@@ -5996,7 +5996,6 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
 ; CHECK-NOLSE-O1-LABEL: cmpxchg_i8:
 ; CHECK-NOLSE-O1:       ; %bb.0:
 ; CHECK-NOLSE-O1-NEXT:    mov x8, x0
-; CHECK-NOLSE-O1-NEXT:    ; kill: def $w2 killed $w2 def $x2
 ; CHECK-NOLSE-O1-NEXT:  LBB67_1: ; %cmpxchg.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldxrb w0, [x8]
@@ -6103,7 +6102,6 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
 ; CHECK-NOLSE-O1-LABEL: cmpxchg_i16:
 ; CHECK-NOLSE-O1:       ; %bb.0:
 ; CHECK-NOLSE-O1-NEXT:    mov x8, x0
-; CHECK-NOLSE-O1-NEXT:    ; kill: def $w2 killed $w2 def $x2
 ; CHECK-NOLSE-O1-NEXT:  LBB68_1: ; %cmpxchg.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldxrh w0, [x8]
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
index c6819ff39ed33e..c02390c4df12dd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
@@ -746,20 +746,20 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) {
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $w1, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $w1 = KILL $w1, implicit-def $x1
+  ; CHECK-NEXT:   $x8 = ORRXrs $xzr, $x0, 0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.atomicrmw.start:
   ; CHECK-NEXT:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
-  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT:   liveins: $w1, $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr)
-  ; CHECK-NEXT:   early-clobber renamable $w9 = STXRB renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr)
+  ; CHECK-NEXT:   renamable $w0 = LDXRB renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w9 = STXRB renamable $w1, renamable $x8, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w9, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = KILL renamable $w0, implicit killed $x0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw xchg ptr %ptr, i8 %rhs monotonic, !pcsections !0
   ret i8 %res
@@ -999,20 +999,20 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) {
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT:   liveins: $w1, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $w1 = KILL $w1, implicit-def $x1
+  ; CHECK-NEXT:   $x8 = ORRXrs $xzr, $x0, 0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.atomicrmw.start:
   ; CHECK-NEXT:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
-  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT:   liveins: $w1, $x8
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr)
-  ; CHECK-NEXT:   early-clobber renamable $w9 = STXRH renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr)
+  ; CHECK-NEXT:   renamable $w0 = LDXRH renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr)
+  ; CHECK-NEXT:   early-clobber renamable $w9 = STXRH renamable $w1, renamable $x8, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w9, %bb.1, pcsections !0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.atomicrmw.end:
-  ; CHECK-NEXT:   liveins: $x8
+  ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8
+  ; CHECK-NEXT:   $w0 = KILL renamable $w0, implicit killed $x0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %res = atomicrmw xchg ptr %ptr, i16 %rhs monotonic, !pcsections !0
   ret i16 %res
@@ -1229,11 +1229,10 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
   ; CHECK-NEXT:   liveins: $w1, $w2, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $x8 = ORRXrs $xzr, $x0, 0
-  ; CHECK-NEXT:   renamable $w2 = KILL $w2, implicit-def $x2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.cmpxchg.start:
   ; CHECK-NEXT:   successors: %bb.2(0x7c000000), %bb.4(0x04000000)
-  ; CHECK-NEXT:   liveins: $w1, $x2, $x8
+  ; CHECK-NEXT:   liveins: $w1, $w2, $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w0 = LDXRB renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr)
   ; CHECK-NEXT:   renamable $w9 = ANDWri renamable $w0, 7, pcsections !0
@@ -1242,7 +1241,7 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.cmpxchg.trystore:
   ; CHECK-NEXT:   successors: %bb.3(0x04000000), %bb.1(0x7c000000)
-  ; CHECK-NEXT:   liveins: $w1, $x0, $x2, $x8
+  ; CHECK-NEXT:   liveins: $w1, $w2, $x0, $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   early-clobber renamable $w9 = STXRB renamable $w2, renamable $x8, pcsections !0 :: (volatile store (s8) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w9, %bb.1
@@ -1272,11 +1271,10 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
   ; CHECK-NEXT:   liveins: $w1, $w2, $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $x8 = ORRXrs $xzr, $x0, 0
-  ; CHECK-NEXT:   renamable $w2 = KILL $w2, implicit-def $x2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.cmpxchg.start:
   ; CHECK-NEXT:   successors: %bb.2(0x7c000000), %bb.4(0x04000000)
-  ; CHECK-NEXT:   liveins: $w1, $x2, $x8
+  ; CHECK-NEXT:   liveins: $w1, $w2, $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $w0 = LDXRH renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr)
   ; CHECK-NEXT:   renamable $w9 = ANDWri renamable $w0, 15, pcsections !0
@@ -1285,7 +1283,7 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.cmpxchg.trystore:
   ; CHECK-NEXT:   successors: %bb.3(0x04000000), %bb.1(0x7c000000)
-  ; CHECK-NEXT:   liveins: $w1, $x0, $x2, $x8
+  ; CHECK-NEXT:   liveins: $w1, $w2, $x0, $x8
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   early-clobber renamable $w9 = STXRH renamable $w2, renamable $x8, pcsections !0 :: (volatile store (s16) into %ir.ptr)
   ; CHECK-NEXT:   CBNZW killed renamable $w9, %bb.1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
index c98dcf6ccb7966..f29fa86123c8c4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
@@ -49,8 +49,8 @@ body:             |
     ; CHECK: liveins: $x0, $x1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %arg1:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64)
-    ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[TRUNC]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: %zext:_(s64) = G_AND %arg1, [[C]]
     ; CHECK-NEXT: $x0 = COPY %zext(s64)
     ; CHECK-NEXT: RET_ReallyLR implicit $x0
     %arg1:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 86fa12aa064acb..3e98a5e8e88009 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -361,10 +361,11 @@ body:             |
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
     ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 101
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1)
-    ; CHECK-NEXT: %sel:_(s8) = G_ADD [[ZEXT]], %one
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s8) = G_AND [[TRUNC]], [[C]]
+    ; CHECK-NEXT: %sel:_(s8) = G_ADD [[AND]], %one
     ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
     ; CHECK-NEXT: $w0 = COPY %ext(s32)
     %0:_(s64) = COPY $x0
@@ -417,10 +418,11 @@ body:             |
     ; CHECK: liveins: $x0, $x1, $x2
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
-    ; CHECK-NEXT: %sel:_(s8) = G_SHL [[ZEXT]], [[C]](s8)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s8) = G_AND [[TRUNC]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
+    ; CHECK-NEXT: %sel:_(s8) = G_SHL [[AND]], [[C1]](s8)
     ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
     ; CHECK-NEXT: $w0 = COPY %ext(s32)
     %0:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
index 8cb44605246ffa..7e4fec1da9c278 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
@@ -10,7 +10,9 @@ body:             |
     ; CHECK: liveins: $w0, $w1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: $x1 = COPY [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: $x1 = COPY [[AND]](s64)
     %0:_(s64) = COPY $x0
     %2:_(s32) = nuw G_TRUNC %0
     %3:_(s64) = G_ZEXT  %2
@@ -25,9 +27,9 @@ body:             |
     ; CHECK: liveins: $w0, $w1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nsw G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
-    ; CHECK-NEXT: $x1 = COPY [[ZEXT]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: $x1 = COPY [[AND]](s64)
     %0:_(s64) = COPY $x0
     %2:_(s32) = nsw G_TRUNC %0
     %3:_(s64) = G_ZEXT  %2
@@ -42,9 +44,9 @@ body:             |
     ; CHECK: liveins: $w0, $w1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
-    ; CHECK-NEXT: $x1 = COPY [[ZEXT]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: $x1 = COPY [[AND]](s64)
     %0:_(s64) = COPY $x0
     %2:_(s32) = G_TRUNC %0
     %3:_(s64) = G_ZEXT  %2
@@ -300,7 +302,10 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: %sv0:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[COPY]](s64)
-    ; CHECK-NEXT: $z0 = COPY %sv0(<vscale x 2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+    ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+    ; CHECK-NEXT: %z:_(<vscale x 2 x s64>) = G_AND %sv0, [[SPLAT_VECTOR]]
+    ; CHECK-NEXT: $z0 = COPY %z(<vscale x 2 x s64>)
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
     %sv0:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR %0:_(s64)
@@ -317,8 +322,10 @@ body:             |
     ; CHECK: liveins: $w0, $w1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK-NEXT: %2:_(s64) = nneg G_ZEXT [[COPY]](s32)
-    ; CHECK-NEXT: $x1 = COPY %2(s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[AND]](s32)
+    ; CHECK-NEXT: $x1 = COPY [[ZEXT]](s64)
     %0:_(s32) = COPY $w0
     %2:_(s16) = nuw G_TRUNC %0
     %3:_(s64) = G_ZEXT  %2
@@ -333,8 +340,10 @@ body:             |
     ; CHECK: liveins: $w0, $w1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nuw G_TRUNC [[COPY]](s64)
-    ; CHECK-NEXT: $w1 = COPY [[TRUNC]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
+    ; CHECK-NEXT: $w1 = COPY [[AND]](s32)
     %0:_(s64) = COPY $x0
     %2:_(s16) = nuw G_TRUNC %0
     %3:_(s32) = G_ZEXT  %2
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
index 9ed1e2d9eee3b4..aeaf636d14724b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
@@ -94,13 +94,14 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s8)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
-    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[AND]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C1]]
     ; CHECK-NEXT: G_STORE [[MUL]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
     ; CHECK-NEXT: G_STORE [[MUL]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
     ; CHECK-NEXT: RET_ReallyLR
     %0:_(p0) = COPY $x0
@@ -183,20 +184,21 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s8)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
-    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[AND]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C1]]
     ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL]](s64)
     ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>) into %ir.dst, align 1)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
     ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[PTR_ADD]](p0) :: (store (<2 x s64>) into %ir.dst + 16, align 1)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
     ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[PTR_ADD1]](p0) :: (store (<2 x s64>) into %ir.dst + 32, align 1)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
-    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
     ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[PTR_ADD2]](p0) :: (store (<2 x s64>) into %ir.dst + 44, align 1)
     ; CHECK-NEXT: RET_ReallyLR
     %0:_(p0) = COPY $x0
@@ -248,13 +250,14 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s8)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
-    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[AND]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C1]]
     ; CHECK-NEXT: G_STORE [[MUL]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
     ; CHECK-NEXT: G_STORE [[MUL]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
     ; CHECK-NEXT: RET_ReallyLR
     %0:_(p0) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads.mir
index 47c85f76785935..48cc1660fe0030 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads.mir
@@ -165,9 +165,9 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
     ; CHECK-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[SEXTLOAD]](s32)
-    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8)
-    ; CHECK-NEXT: $w0 = COPY [[ZEXT]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[SEXTLOAD]], [[C]]
+    ; CHECK-NEXT: $w0 = COPY [[AND]](s32)
     ; CHECK-NEXT: $w1 = COPY [[SEXTLOAD]](s32)
     %0:_(p0) = COPY $x0
     %1:_(s8) = G_LOAD %0 :: (load (s8))
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index ff7872c922e32f..fa44e033c7d0ac 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -432,9 +432,8 @@ define void @memset_10(ptr %dst, i32 %value) {
 ;
 ; GISel-WITHOUT-MOPS-O3-LABEL: memset_10:
 ; GISel-WITHOUT-MOPS-O3:       // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; GISel-WITHOUT-MOPS-O3-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
-; GISel-WITHOUT-MOPS-O3-NEXT:    and x9, x1, #0xff
+; GISel-WITHOUT-MOPS-O3-NEXT:    and w9, w1, #0xff
 ; GISel-WITHOUT-MOPS-O3-NEXT:    mul x8, x9, x8
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x8, [x0]
 ; GISel-WITHOUT-MOPS-O3-NEXT:    strh w8, [x0, #8]
@@ -454,9 +453,8 @@ define void @memset_10(ptr %dst, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memset_10:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; GISel-MOPS-O3-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
-; GISel-MOPS-O3-NEXT:    and x9, x1, #0xff
+; GISel-MOPS-O3-NEXT:    and w9, w1, #0xff
 ; GISel-MOPS-O3-NEXT:    mul x8, x9, x8
 ; GISel-MOPS-O3-NEXT:    str x8, [x0]
 ; GISel-MOPS-O3-NEXT:    strh w8, [x0, #8]
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index d677526bab0005..cd3e638043a888 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1750,11 +1750,11 @@ define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) {
 ;
 ; CHECK-GI-LABEL: umull_smaller_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT:    movi v3.8h, #15
+; CHECK-GI-NEXT:    movi v2.8b, #15
+; CHECK-GI-NEXT:    movi v3.2d, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/addsub_ext.ll b/llvm/test/CodeGen/AArch64/addsub_ext.ll
index 04a98bd5088803..81eae261f26477 100644
--- a/llvm/test/CodeGen/AArch64/addsub_ext.ll
+++ b/llvm/test/CodeGen/AArch64/addsub_ext.ll
@@ -24,22 +24,12 @@ define i32 @add_z_shli8i32(i8 %v, i32 %lhs) minsize {
 }
 
 define i64 @add_z_i8i64(i8 %v, i64 %lhs) minsize {
-; CHECK-LABEL: add_z_i8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    add x0, x1, w0, uxtb
-; CHECK-NEXT:    ret
   %vz = zext i8 %v to i64
   %r = add i64 %lhs, %vz
   ret i64 %r
 }
 
 define i64 @add_z_shli8i64(i8 %v, i64 %lhs) minsize {
-; CHECK-LABEL: add_z_shli8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    add x0, x1, w0, uxtb #3
-; CHECK-NEXT:    ret
   %vz = zext i8 %v to i64
   %s = shl i64 %vz, 3
   %r = add i64 %lhs, %s
@@ -112,22 +102,12 @@ define i32 @add_z_shli16i32(i16 %v, i32 %lhs) minsize {
 }
 
 define i64 @add_z_i16i64(i16 %v, i64 %lhs) minsize {
-; CHECK-LABEL: add_z_i16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    add x0, x1, w0, uxth
-; CHECK-NEXT:    ret
   %vz = zext i16 %v to i64
   %r = add i64 %lhs, %vz
   ret i64 %r
 }
 
 define i64 @add_z_shli16i64(i16 %v, i64 %lhs) minsize {
-; CHECK-LABEL: add_z_shli16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    add x0, x1, w0, uxth #3
-; CHECK-NEXT:    ret
   %vz = zext i16 %v to i64
   %s = shl i64 %vz, 3
   %r = add i64 %lhs, %s
@@ -242,22 +222,12 @@ define i32 @sub_z_shli8i32(i8 %v, i32 %lhs) minsize {
 }
 
 define i64 @sub_z_i8i64(i8 %v, i64 %lhs) minsize {
-; CHECK-LABEL: sub_z_i8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sub x0, x1, w0, uxtb
-; CHECK-NEXT:    ret
   %vz = zext i8 %v to i64
   %r = sub i64 %lhs, %vz
   ret i64 %r
 }
 
 define i64 @sub_z_shli8i64(i8 %v, i64 %lhs) minsize {
-; CHECK-LABEL: sub_z_shli8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sub x0, x1, w0, uxtb #3
-; CHECK-NEXT:    ret
   %vz = zext i8 %v to i64
   %s = shl i64 %vz, 3
   %r = sub i64 %lhs, %s
@@ -330,22 +300,12 @@ define i32 @sub_z_shli16i32(i16 %v, i32 %lhs) minsize {
 }
 
 define i64 @sub_z_i16i64(i16 %v, i64 %lhs) minsize {
-; CHECK-LABEL: sub_z_i16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sub x0, x1, w0, uxth
-; CHECK-NEXT:    ret
   %vz = zext i16 %v to i64
   %r = sub i64 %lhs, %vz
   ret i64 %r
 }
 
 define i64 @sub_z_shli16i64(i16 %v, i64 %lhs) minsize {
-; CHECK-LABEL: sub_z_shli16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sub x0, x1, w0, uxth #3
-; CHECK-NEXT:    ret
   %vz = zext i16 %v to i64
   %s = shl i64 %vz, 3
   %r = sub i64 %lhs, %s
@@ -444,7 +404,7 @@ define i32 @cmp_s_i8i32(i8 %v, i32 %lhs) minsize {
 ; CHECK-NEXT:    cmp w1, w0, uxtb
 ; CHECK-NEXT:    b.ge .LBB40_2
 ; CHECK-NEXT:  // %bb.1: // %then
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB40_2: // %end
 ; CHECK-NEXT:    mov w0, w1
@@ -465,7 +425,7 @@ define i64 @cmp_s_i8i64(i8 %v, i64 %lhs) minsize {
 ; CHECK-NEXT:    cmp x1, w0, sxtb
 ; CHECK-NEXT:    b.ge .LBB41_2
 ; CHECK-NEXT:  // %bb.1: // %then
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB41_2: // %end
 ; CHECK-NEXT:    mov x0, x1
@@ -485,7 +445,7 @@ define i32 @cmp_s_i16i32(i16 %v, i32 %lhs) minsize {
 ; CHECK-NEXT:    cmp w1, w0, uxth
 ; CHECK-NEXT:    b.ge .LBB42_2
 ; CHECK-NEXT:  // %bb.1: // %then
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB42_2: // %end
 ; CHECK-NEXT:    mov w0, w1
@@ -506,7 +466,7 @@ define i64 @cmp_s_i16i64(i16 %v, i64 %lhs) minsize {
 ; CHECK-NEXT:    cmp x1, w0, sxth
 ; CHECK-NEXT:    b.ge .LBB43_2
 ; CHECK-NEXT:  // %bb.1: // %then
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB43_2: // %end
 ; CHECK-NEXT:    mov x0, x1
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index a6a825b26b3b52..779b4414d23af6 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -241,12 +241,11 @@ define <4 x i64> @zext_v4i8_to_v4i64(<4 x i8> %v0) nounwind {
 ;
 ; CHECK-GI-LABEL: zext_v4i8_to_v4i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
-; CHECK-GI-NEXT:    movi.2d v1, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll.2d v2, v0, #0
-; CHECK-GI-NEXT:    ushll2.2d v3, v0, #0
-; CHECK-GI-NEXT:    and.16b v0, v2, v1
-; CHECK-GI-NEXT:    and.16b v1, v3, v1
+; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and.8b v0, v0, v1
+; CHECK-GI-NEXT:    ushll.4s v1, v0, #0
+; CHECK-GI-NEXT:    ushll.2d v0, v1, #0
+; CHECK-GI-NEXT:    ushll2.2d v1, v1, #0
 ; CHECK-GI-NEXT:    ret
   %r = zext <4 x i8> %v0 to <4 x i64>
   ret <4 x i64> %r
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index f46e6ae989ff20..467d7b908f263f 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -263,23 +263,14 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
-; CHECK-SD-LABEL: add_v2i16_v2i64_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    addp d0, v0.2d
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v2i16_v2i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v2i16_v2i64_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    addp d0, v0.2d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -475,8 +466,7 @@ define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
 ; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    uaddlv s0, v0.4h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    and w0, w8, #0xffff
+; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
@@ -630,7 +620,7 @@ define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    uaddlv h0, v0.16b
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    and x0, x8, #0xffff
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -687,7 +677,7 @@ define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    uaddlv h0, v0.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    and x0, x8, #0xffff
+; CHECK-GI-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
@@ -734,8 +724,7 @@ define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
 ; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    uaddlv s0, v0.4h
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    and x0, x8, #0xffff
+; CHECK-GI-NEXT:    mov w0, v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
@@ -772,23 +761,14 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
-; CHECK-SD-LABEL: add_v2i8_v2i64_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    addp d0, v0.2d
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v2i8_v2i64_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x0, d0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v2i8_v2i64_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    addp d0, v0.2d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1083,25 +1063,15 @@ entry:
 }
 
 define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
-; CHECK-SD-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    addp d0, v0.2d
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    add x0, x8, x0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    addp d0, v0.2d
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    add x0, x8, x0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -1318,7 +1288,7 @@ define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    uaddlv s0, v0.4h
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add w0, w0, w8, uxth
+; CHECK-GI-NEXT:    add w0, w8, w0
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
@@ -1478,7 +1448,8 @@ define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    uaddlv h0, v0.16b
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add x0, x0, w8, uxth
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    add x0, x8, x0
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -1539,7 +1510,8 @@ define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    uaddlv h0, v0.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add x0, x0, w8, uxth
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    add x0, x8, x0
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
@@ -1591,7 +1563,7 @@ define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    uaddlv s0, v0.4h
 ; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    add x0, x0, w8, uxth
+; CHECK-GI-NEXT:    add x0, x0, w8, uxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
@@ -1631,25 +1603,15 @@ entry:
 }
 
 define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
-; CHECK-SD-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    addp d0, v0.2d
-; CHECK-SD-NEXT:    fmov x8, d0
-; CHECK-SD-NEXT:    add x0, x8, x0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    addp d0, v0.2d
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    add x0, x8, x0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    addp d0, v0.2d
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    add x0, x8, x0
+; CHECK-NEXT:    ret
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -2788,11 +2750,11 @@ define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v2.2d, #0x0000000000ffff
+; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-GI-NEXT:    addp d0, v0.2d
 ; CHECK-GI-NEXT:    addp d1, v1.2d
 ; CHECK-GI-NEXT:    fmov x8, d0
@@ -3085,14 +3047,13 @@ define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
-; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-GI-NEXT:    uaddlv s1, v1.4h
+; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
 ; CHECK-GI-NEXT:    uaddlv s0, v0.4h
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    and w8, w8, #0xffff
-; CHECK-GI-NEXT:    add w0, w8, w9, uxth
+; CHECK-GI-NEXT:    uaddlv s1, v1.4h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i32>
@@ -3306,12 +3267,13 @@ define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    uaddlv h1, v1.16b
 ; CHECK-GI-NEXT:    uaddlv h0, v0.16b
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    and x8, x8, #0xffff
-; CHECK-GI-NEXT:    add x0, x8, w9, uxth
+; CHECK-GI-NEXT:    uaddlv h1, v1.16b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    and w9, w9, #0xffff
+; CHECK-GI-NEXT:    add x0, x8, x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
@@ -3396,12 +3358,13 @@ define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    uaddlv h1, v1.8b
 ; CHECK-GI-NEXT:    uaddlv h0, v0.8b
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    and x8, x8, #0xffff
-; CHECK-GI-NEXT:    add x0, x8, w9, uxth
+; CHECK-GI-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    and w8, w8, #0xffff
+; CHECK-GI-NEXT:    and w9, w9, #0xffff
+; CHECK-GI-NEXT:    add x0, x8, x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <8 x i8> %x to <8 x i64>
@@ -3470,10 +3433,9 @@ define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    uaddlv s1, v1.4h
 ; CHECK-GI-NEXT:    uaddlv s0, v0.4h
-; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov w8, v1.s[0]
 ; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    and x8, x8, #0xffff
-; CHECK-GI-NEXT:    add x0, x8, w9, uxth
+; CHECK-GI-NEXT:    add x0, x8, w9, uxtw
 ; CHECK-GI-NEXT:    ret
 entry:
   %xx = zext <4 x i8> %x to <4 x i64>
@@ -3542,11 +3504,11 @@ define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
 ;
 ; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-GI-NEXT:    addp d0, v0.2d
 ; CHECK-GI-NEXT:    addp d1, v1.2d
 ; CHECK-GI-NEXT:    fmov x8, d0
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 7e95b6684e8211..269bbe8bb585a4 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -25,11 +25,16 @@ entry:
 }
 
 define i64 @zext_i8_to_i64(i8 %a) {
-; CHECK-LABEL: zext_i8_to_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    and x0, x0, #0xff
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: zext_i8_to_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    and x0, x0, #0xff
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: zext_i8_to_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w0, w0, #0xff
+; CHECK-GI-NEXT:    ret
 entry:
   %c = zext i8 %a to i64
   ret i64 %c
@@ -56,11 +61,16 @@ entry:
 }
 
 define i64 @zext_i16_to_i64(i16 %a) {
-; CHECK-LABEL: zext_i16_to_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    and x0, x0, #0xffff
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: zext_i16_to_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    and x0, x0, #0xffff
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: zext_i16_to_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w0, w0, #0xffff
+; CHECK-GI-NEXT:    ret
 entry:
   %c = zext i16 %a to i64
   ret i64 %c
@@ -97,22 +107,36 @@ entry:
 }
 
 define i64 @zext_i10_to_i64(i10 %a) {
-; CHECK-LABEL: zext_i10_to_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    and x0, x0, #0x3ff
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: zext_i10_to_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    and x0, x0, #0x3ff
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: zext_i10_to_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w0, w0, #0x3ff
+; CHECK-GI-NEXT:    ret
 entry:
   %c = zext i10 %a to i64
   ret i64 %c
 }
 
 define <2 x i16> @zext_v2i8_v2i16(<2 x i8> %a) {
-; CHECK-LABEL: zext_v2i8_v2i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: zext_v2i8_v2i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: zext_v2i8_v2i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <2 x i8> %a to <2 x i16>
   ret <2 x i16> %c
@@ -130,19 +154,12 @@ entry:
 }
 
 define <2 x i64> @zext_v2i8_v2i64(<2 x i8> %a) {
-; CHECK-SD-LABEL: zext_v2i8_v2i64:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: zext_v2i8_v2i64:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: zext_v2i8_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ret
 entry:
   %c = zext <2 x i8> %a to <2 x i64>
   ret <2 x i64> %c
@@ -160,19 +177,12 @@ entry:
 }
 
 define <2 x i64> @zext_v2i16_v2i64(<2 x i16> %a) {
-; CHECK-SD-LABEL: zext_v2i16_v2i64:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: zext_v2i16_v2i64:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: zext_v2i16_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ret
 entry:
   %c = zext <2 x i16> %a to <2 x i64>
   ret <2 x i64> %c
@@ -189,11 +199,20 @@ entry:
 }
 
 define <2 x i16> @zext_v2i10_v2i16(<2 x i10> %a) {
-; CHECK-LABEL: zext_v2i10_v2i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v1.2s, #3, msl #8
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: zext_v2i10_v2i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.2s, #3, msl #8
+; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: zext_v2i10_v2i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, #1023 // =0x3ff
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <2 x i10> %a to <2 x i16>
   ret <2 x i16> %c
@@ -211,20 +230,12 @@ entry:
 }
 
 define <2 x i64> @zext_v2i10_v2i64(<2 x i10> %a) {
-; CHECK-SD-LABEL: zext_v2i10_v2i64:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi v1.2s, #3, msl #8
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: zext_v2i10_v2i64:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI18_0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: zext_v2i10_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v1.2s, #3, msl #8
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ret
 entry:
   %c = zext <2 x i10> %a to <2 x i64>
   ret <2 x i64> %c
@@ -498,9 +509,9 @@ define <4 x i32> @zext_v4i8_v4i32(<4 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v4i8_v4i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <4 x i8> %a to <4 x i32>
@@ -518,12 +529,11 @@ define <4 x i64> @zext_v4i8_v4i64(<4 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v4i8_v4i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v3.2d, v0.4s, #0
-; CHECK-GI-NEXT:    and v0.16b, v2.16b, v1.16b
-; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <4 x i8> %a to <4 x i64>
@@ -602,9 +612,9 @@ define <4 x i32> @zext_v4i10_v4i32(<4 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v4i10_v4i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.4s, #3, msl #8
+; CHECK-GI-NEXT:    mvni v1.4h, #252, lsl #8
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <4 x i10> %a to <4 x i32>
@@ -622,13 +632,11 @@ define <4 x i64> @zext_v4i10_v4i64(<4 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v4i10_v4i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
-; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI36_0]
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v2.2d, v0.4s, #0
-; CHECK-GI-NEXT:    and v0.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT:    and v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    mvni v1.4h, #252, lsl #8
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <4 x i10> %a to <4 x i64>
@@ -785,11 +793,10 @@ define <8 x i32> @zext_v8i10_v8i32(<8 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v8i10_v8i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.4s, #3, msl #8
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-GI-NEXT:    and v0.16b, v2.16b, v1.16b
-; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
+; CHECK-GI-NEXT:    mvni v1.8h, #252, lsl #8
+; CHECK-GI-NEXT:    and v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ushll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <8 x i10> %a to <8 x i32>
@@ -810,18 +817,14 @@ define <8 x i64> @zext_v8i10_v8i64(<8 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v8i10_v8i64:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mvni v1.8h, #252, lsl #8
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    adrp x8, .LCPI45_0
-; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI45_0]
-; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v5.2d, v0.4s, #0
-; CHECK-GI-NEXT:    and v0.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT:    and v2.16b, v4.16b, v3.16b
-; CHECK-GI-NEXT:    and v3.16b, v5.16b, v3.16b
+; CHECK-GI-NEXT:    ushll v2.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <8 x i10> %a to <8 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index ff5880819020da..355a541f4de87a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -449,6 +449,7 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_add_u32 s0, s0, s1
 ; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX7-NEXT:    s_and_b32 s1, s1, 1
 ; GFX7-NEXT:    s_add_i32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -456,6 +457,7 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -463,6 +465,7 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    s_add_i32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
   %uaddo = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
@@ -477,27 +480,48 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-LABEL: s_uaddo_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_add_u32 s0, s0, s2
+; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
 ; GFX7-NEXT:    s_add_u32 s0, s0, s2
+; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_uaddo_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s3
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX9-NEXT:    ; return to shader part epilog
   %uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
@@ -515,6 +539,8 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX7-NEXT:    s_add_u32 s1, s1, s3
 ; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
+; GFX7-NEXT:    s_and_b32 s3, s3, 1
 ; GFX7-NEXT:    s_add_i32 s0, s0, s2
 ; GFX7-NEXT:    s_add_i32 s1, s1, s3
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -525,6 +551,8 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_add_u32 s1, s1, s3
 ; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    s_add_i32 s0, s0, s2
 ; GFX8-NEXT:    s_add_i32 s1, s1, s3
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -535,6 +563,8 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_add_u32 s1, s1, s3
 ; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    s_add_i32 s0, s0, s2
 ; GFX9-NEXT:    s_add_i32 s1, s1, s3
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -679,6 +709,9 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-LABEL: s_saddo_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_add_u32 s4, s0, s2
+; GFX7-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX7-NEXT:    s_and_b32 s5, s5, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
@@ -696,6 +729,9 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX8-LABEL: s_saddo_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s4, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -713,6 +749,9 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX9-LABEL: s_saddo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s4, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index 359c1e53de99e3..b975aadf74ce49 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -11,6 +11,10 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_u32 s0, s6, s0
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-NEXT:    s_addc_u32 s1, s7, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -63,6 +67,10 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_sub_u32 s0, s6, s0
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-NEXT:    s_subb_u32 s1, s7, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..9d454ee8b5d826 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1847,11 +1847,13 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GCN-NEXT:    s_ashr_i32 s7, s5, 31
 ; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], s10
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
+; GCN-NEXT:    s_and_b32 s8, s11, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
+; GCN-NEXT:    s_and_b32 s4, s12, 1
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
+; GCN-NEXT:    s_cmp_lg_u32 s8, 0
 ; GCN-NEXT:    s_cselect_b32 s2, s6, s7
 ; GCN-NEXT:    ; return to shader part epilog
 ;
@@ -1868,13 +1870,15 @@ define amdgpu_ps i65 @s_ashr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s2
 ; GFX10PLUS-NEXT:    s_ashr_i64 s[2:3], s[4:5], s3
 ; GFX10PLUS-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10PLUS-NEXT:    s_and_b32 s8, s11, 1
 ; GFX10PLUS-NEXT:    s_ashr_i32 s3, s5, 31
 ; GFX10PLUS-NEXT:    s_ashr_i64 s[4:5], s[4:5], s10
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT:    s_and_b32 s6, s12, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = ashr i65 %value, %amount
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll
index 439ffbac960b8c..df0401a1a51739 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll
@@ -12,6 +12,7 @@ define void @br_false() {
 ; WAVE64-NEXT:  .LBB0_1: ; %bb0
 ; WAVE64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE64-NEXT:    s_mov_b32 s4, 1
+; WAVE64-NEXT:    s_and_b32 s4, s4, 1
 ; WAVE64-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE64-NEXT:    s_cbranch_scc1 .LBB0_1
 ; WAVE64-NEXT:  ; %bb.2: ; %.exit5
@@ -23,6 +24,7 @@ define void @br_false() {
 ; WAVE32-NEXT:  .LBB0_1: ; %bb0
 ; WAVE32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE32-NEXT:    s_mov_b32 s4, 1
+; WAVE32-NEXT:    s_and_b32 s4, s4, 1
 ; WAVE32-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE32-NEXT:    s_cbranch_scc1 .LBB0_1
 ; WAVE32-NEXT:  ; %bb.2: ; %.exit5
@@ -44,6 +46,7 @@ define void @br_true() {
 ; WAVE64-NEXT:  .LBB1_1: ; %bb0
 ; WAVE64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE64-NEXT:    s_mov_b32 s4, 0
+; WAVE64-NEXT:    s_and_b32 s4, s4, 1
 ; WAVE64-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE64-NEXT:    s_cbranch_scc1 .LBB1_1
 ; WAVE64-NEXT:  ; %bb.2: ; %.exit5
@@ -55,6 +58,7 @@ define void @br_true() {
 ; WAVE32-NEXT:  .LBB1_1: ; %bb0
 ; WAVE32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE32-NEXT:    s_mov_b32 s4, 0
+; WAVE32-NEXT:    s_and_b32 s4, s4, 1
 ; WAVE32-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE32-NEXT:    s_cbranch_scc1 .LBB1_1
 ; WAVE32-NEXT:  ; %bb.2: ; %.exit5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index 132dc876b3b054..f985760627942d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -453,8 +453,8 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
 ; GFX7-NEXT:    s_lshl_b32 s2, s1, 8
 ; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80008
 ; GFX7-NEXT:    s_or_b32 s1, s1, s2
-; GFX7-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX7-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
index 6a291510fe66c1..3041d79295a657 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
@@ -413,9 +413,9 @@ body:             |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0
-    ; GFX6-NEXT: %truncate:_(s16) = G_TRUNC %argument(s32)
     ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
-    ; GFX6-NEXT: %extend:_(s32) = G_ZEXT %truncate(s16)
+    ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX6-NEXT: %extend:_(s32) = G_AND %argument, [[C]]
     ; GFX6-NEXT: %shl:_(s32) = G_SHL %extend, %shiftamt(s16)
     ; GFX6-NEXT: $vgpr0 = COPY %shl(s32)
     ;
@@ -423,9 +423,9 @@ body:             |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0
-    ; GFX9-NEXT: %truncate:_(s16) = G_TRUNC %argument(s32)
     ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
-    ; GFX9-NEXT: %extend:_(s32) = G_ZEXT %truncate(s16)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX9-NEXT: %extend:_(s32) = G_AND %argument, [[C]]
     ; GFX9-NEXT: %shl:_(s32) = G_SHL %extend, %shiftamt(s16)
     ; GFX9-NEXT: $vgpr0 = COPY %shl(s32)
     %argument:_(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
index 6ceb41199af6da..4256bb849664a3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
@@ -285,9 +285,9 @@ body:             |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0
-    ; GFX6-NEXT: %truncate:_(s16) = G_TRUNC %argument(s32)
     ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
-    ; GFX6-NEXT: %extend:_(s32) = G_ZEXT %truncate(s16)
+    ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX6-NEXT: %extend:_(s32) = G_AND %argument, [[C]]
     ; GFX6-NEXT: %shl:_(s32) = G_SHL %extend, %shiftamt(s16)
     ; GFX6-NEXT: $vgpr0 = COPY %shl(s32)
     ;
@@ -295,9 +295,9 @@ body:             |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0
-    ; GFX9-NEXT: %truncate:_(s16) = G_TRUNC %argument(s32)
     ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
-    ; GFX9-NEXT: %extend:_(s32) = G_ZEXT %truncate(s16)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX9-NEXT: %extend:_(s32) = G_AND %argument, [[C]]
     ; GFX9-NEXT: %shl:_(s32) = G_SHL %extend, %shiftamt(s16)
     ; GFX9-NEXT: $vgpr0 = COPY %shl(s32)
     %argument:_(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir
index 3423af64162e52..7ba82d5f4d1fdd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir
@@ -36,8 +36,8 @@ body: |
     ; GCN: liveins: $vgpr0
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: %var:_(s32) = COPY $vgpr0
-    ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32)
-    ; GCN-NEXT: %zext:_(s32) = G_ZEXT [[TRUNC]](s16)
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GCN-NEXT: %zext:_(s32) = G_AND %var, [[C]]
     ; GCN-NEXT: $vgpr0 = COPY %zext(s32)
     %var:_(s32) = COPY $vgpr0
     %cFFFFF:_(s32) = G_CONSTANT i32 1048575
@@ -136,8 +136,9 @@ body: |
     ; GCN-NEXT: %c7FFF:_(s32) = G_CONSTANT i32 32767
     ; GCN-NEXT: %c:_(<2 x s32>) = G_BUILD_VECTOR %cFFFFF(s32), %c7FFF(s32)
     ; GCN-NEXT: %low_bits:_(<2 x s32>) = G_AND %var, %c
-    ; GCN-NEXT: %trunc:_(<2 x s16>) = G_TRUNC %low_bits(<2 x s32>)
-    ; GCN-NEXT: %zext:_(<2 x s32>) = G_ZEXT %trunc(<2 x s16>)
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
+    ; GCN-NEXT: %zext:_(<2 x s32>) = G_AND %low_bits, [[BUILD_VECTOR]]
     ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(<2 x s32>)
     %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %cFFFFF:_(s32) = G_CONSTANT i32 1048575
@@ -164,9 +165,8 @@ body: |
     ; GCN-NEXT: %c7FFF:_(s64) = G_CONSTANT i64 32767
     ; GCN-NEXT: %c:_(<2 x s64>) = G_BUILD_VECTOR %c3FFF(s64), %c7FFF(s64)
     ; GCN-NEXT: %low_bits:_(<2 x s64>) = G_AND %var, %c
-    ; GCN-NEXT: %trunc:_(<2 x s16>) = G_TRUNC %low_bits(<2 x s64>)
-    ; GCN-NEXT: %zext:_(<2 x s32>) = G_ZEXT %trunc(<2 x s16>)
-    ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(<2 x s32>)
+    ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC %low_bits(<2 x s64>)
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[TRUNC]](<2 x s32>)
     %var:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     %c3FFF:_(s64) = G_CONSTANT i64 16383
     %c7FFF:_(s64) = G_CONSTANT i64 32767
@@ -176,7 +176,6 @@ body: |
     %zext:_(<2 x s32>) = G_ZEXT %trunc(<2 x s16>)
     $vgpr0_vgpr1 = COPY %zext(<2 x s32>)
 ...
-
 ---
 name: zext_trunc_v2s32_v2s16_v2s64
 tracksRegLiveness: true
@@ -192,8 +191,7 @@ body: |
     ; GCN-NEXT: %c7FFF:_(s32) = G_CONSTANT i32 32767
     ; GCN-NEXT: %c:_(<2 x s32>) = G_BUILD_VECTOR %c3FFF(s32), %c7FFF(s32)
     ; GCN-NEXT: %low_bits:_(<2 x s32>) = G_AND %var, %c
-    ; GCN-NEXT: %trunc:_(<2 x s16>) = G_TRUNC %low_bits(<2 x s32>)
-    ; GCN-NEXT: %zext:_(<2 x s64>) = G_ZEXT %trunc(<2 x s16>)
+    ; GCN-NEXT: %zext:_(<2 x s64>) = G_ZEXT %low_bits(<2 x s32>)
     ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %zext(<2 x s64>)
     %var:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %c3FFF:_(s32) = G_CONSTANT i32 16383
@@ -204,3 +202,85 @@ body: |
     %zext:_(<2 x s64>) = G_ZEXT %trunc(<2 x s16>)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %zext(<2 x s64>)
 ...
+---
+name: zext_trunc_s32_s16_s64_2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: zext_trunc_s32_s16_s64_2
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %var:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GCN-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND %var, [[C]]
+    ; GCN-NEXT: %zext:_(s64) = G_ZEXT [[AND]](s32)
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(s64)
+    %var:_(s32) = COPY $vgpr0
+    %trunc:_(s16) = G_TRUNC %var(s32)
+    %zext:_(s64) = G_ZEXT %trunc(s16)
+    $vgpr0_vgpr1 = COPY %zext(s64)
+...
+---
+name: zext_trunc_s64_s16_s64_2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: zext_trunc_s64_s16_s64_2
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535
+    ; GCN-NEXT: %zext:_(s64) = G_AND %var, [[C]]
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(s64)
+    %var:_(s64) = COPY $vgpr0_vgpr1
+    %trunc:_(s16) = G_TRUNC %var(s64)
+    %zext:_(s64) = G_ZEXT %trunc(s16)
+    $vgpr0_vgpr1 = COPY %zext(s64)
+...
+---
+name: zext_trunc_s64_s16_s32_2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: zext_trunc_s64_s16_s32_2
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %var(s64)
+    ; GCN-NEXT: %zext:_(s32) = G_AND [[TRUNC]], [[C]]
+    ; GCN-NEXT: $vgpr0 = COPY %zext(s32)
+    %var:_(s64) = COPY $vgpr0_vgpr1
+    %trunc:_(s16) = G_TRUNC %var(s64)
+    %zext:_(s32) = G_ZEXT %trunc(s16)
+    $vgpr0 = COPY %zext(s32)
+...
+---
+name: zext_trunc_s64_s16_s32_2_multi_use
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: zext_trunc_s64_s16_s32_2_multi_use
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %var(s64)
+    ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16)
+    ; GCN-NEXT: %zext2:_(s32) = G_ZEXT %trunc(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %zext(s32)
+    ; GCN-NEXT: $vgpr0 = COPY %zext2(s32)
+    %var:_(s64) = COPY $vgpr0_vgpr1
+    %trunc:_(s16) = G_TRUNC %var(s64)
+    %zext:_(s32) = G_ZEXT %trunc(s16)
+    %zext2:_(s32) = G_ZEXT %trunc(s16)
+    $vgpr0 = COPY %zext(s32)
+    $vgpr0 = COPY %zext2(s32)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index bb7bc0447aea04..d39c751030aae6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -28,6 +28,9 @@ define amdgpu_ps void @divergent_i1_phi_uniform_branch(ptr addrspace(1) %out, i3
 ; GFX10-LABEL: divergent_i1_phi_uniform_branch:
 ; GFX10:       ; %bb.0: ; %A
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
@@ -65,16 +68,19 @@ exit:
 define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(ptr addrspace(1) %out, i32 %tid, i32 inreg %cond) {
 ; GFX10-LABEL: divergent_i1_phi_uniform_branch_simple:
 ; GFX10:       ; %bb.0: ; %A
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, 6, v2
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %B
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    s_andn2_b32 s0, s1, exec_lo
+; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
-; GFX10-NEXT:    s_or_b32 s1, s0, s1
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:  .LBB1_2: ; %exit
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index f52b7c635a66f1..3674f0b4f54e5d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -135,6 +135,9 @@ define void @constrained_if_register_class() {
 ; CHECK-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_and_b32 s4, s4, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB4_2
 ; CHECK-NEXT:  .LBB4_1: ; %bb12
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -156,6 +159,9 @@ define void @constrained_if_register_class() {
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:  .LBB4_4: ; %bb8
 ; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
+; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_and_b32 s4, s4, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB4_1
 ; CHECK-NEXT:  ; %bb.5: ; %bb11
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4.0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 3bd3486ec261d4..44d2c21f203f2a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -350,11 +350,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 ; GFX8-LABEL: s_fshl_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX8-NEXT:    s_and_b32 s3, s2, 7
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s3, s2, 7
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
@@ -364,11 +365,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 ; GFX9-LABEL: s_fshl_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX9-NEXT:    s_and_b32 s3, s2, 7
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT:    s_and_b32 s3, s2, 7
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
@@ -381,8 +383,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 ; GFX10-NEXT:    s_and_b32 s3, s2, 7
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
@@ -395,8 +398,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 ; GFX11-NEXT:    s_and_b32 s3, s2, 7
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, s2
@@ -700,24 +704,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 8
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX8-NEXT:    s_and_b32 s6, s2, 7
-; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s5, 7
-; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, s4, 0xff
-; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_andn2_b32 s3, 7, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX8-NEXT:    s_andn2_b32 s3, 7, s5
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
@@ -731,24 +737,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s4, s1, 8
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX9-NEXT:    s_and_b32 s6, s2, 7
-; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s6
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s5, 7
-; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s2, s4, 0xff
-; GFX9-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT:    s_andn2_b32 s3, 7, s5
+; GFX9-NEXT:    s_lshl_b32 s1, s3, s1
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    s_andn2_b32 s3, 7, s5
+; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
@@ -761,25 +769,27 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX10-LABEL: s_fshl_v2i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX10-NEXT:    s_and_b32 s5, s2, 7
-; GFX10-NEXT:    s_lshr_b32 s6, s2, 8
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT:    s_and_b32 s6, s2, 7
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s5
-; GFX10-NEXT:    s_and_b32 s5, s6, 7
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    s_andn2_b32 s6, 7, s6
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX10-NEXT:    s_and_b32 s6, s5, 7
 ; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
-; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s5
-; GFX10-NEXT:    s_lshr_b32 s4, s4, s6
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
+; GFX10-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX10-NEXT:    s_or_b32 s2, s3, s4
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
@@ -792,25 +802,27 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX11-LABEL: s_fshl_v2i8:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX11-NEXT:    s_and_b32 s5, s2, 7
-; GFX11-NEXT:    s_lshr_b32 s6, s2, 8
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT:    s_and_b32 s6, s2, 7
 ; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX11-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    s_lshl_b32 s0, s0, s5
-; GFX11-NEXT:    s_and_b32 s5, s6, 7
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT:    s_and_not1_b32 s6, 7, s6
+; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX11-NEXT:    s_and_b32 s6, s5, 7
 ; GFX11-NEXT:    s_lshr_b32 s4, s4, 1
-; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT:    s_and_not1_b32 s5, 7, s5
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT:    s_lshl_b32 s3, s3, s5
-; GFX11-NEXT:    s_lshr_b32 s4, s4, s6
+; GFX11-NEXT:    s_lshl_b32 s3, s3, s6
+; GFX11-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX11-NEXT:    s_or_b32 s2, s3, s4
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
@@ -1026,14 +1038,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s1, 24
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
 ; GFX8-NEXT:    s_lshr_b32 s11, s2, 24
 ; GFX8-NEXT:    s_and_b32 s12, s2, 7
-; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX8-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX8-NEXT:    s_and_b32 s12, 0xffff, s12
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
@@ -1042,31 +1055,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s9, 7
-; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, s6, 0xff
-; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_andn2_b32 s3, 7, s9
+; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX8-NEXT:    s_andn2_b32 s3, 7, s9
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    s_and_b32 s2, s10, 7
-; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s3, s7, 0xff
-; GFX8-NEXT:    s_lshl_b32 s2, s4, s2
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT:    s_andn2_b32 s4, 7, s10
+; GFX8-NEXT:    s_lshl_b32 s2, s4, s2
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_andn2_b32 s4, 7, s10
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
 ; GFX8-NEXT:    s_or_b32 s2, s2, s3
 ; GFX8-NEXT:    s_and_b32 s3, s11, 7
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s8
 ; GFX8-NEXT:    s_lshl_b32 s3, s5, s3
+; GFX8-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX8-NEXT:    s_andn2_b32 s5, 7, s11
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX8-NEXT:    s_lshr_b32 s4, s8, 1
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
@@ -1087,14 +1104,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 24
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s11, s2, 24
 ; GFX9-NEXT:    s_and_b32 s12, s2, 7
-; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX9-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX9-NEXT:    s_and_b32 s12, 0xffff, s12
-; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
@@ -1103,31 +1121,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s9, 7
-; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s2, s6, 0xff
-; GFX9-NEXT:    s_lshl_b32 s1, s3, s1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT:    s_andn2_b32 s3, 7, s9
+; GFX9-NEXT:    s_lshl_b32 s1, s3, s1
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX9-NEXT:    s_andn2_b32 s3, 7, s9
+; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
 ; GFX9-NEXT:    s_and_b32 s2, s10, 7
-; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_and_b32 s3, s7, 0xff
-; GFX9-NEXT:    s_lshl_b32 s2, s4, s2
+; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT:    s_andn2_b32 s4, 7, s10
+; GFX9-NEXT:    s_lshl_b32 s2, s4, s2
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX9-NEXT:    s_andn2_b32 s4, 7, s10
+; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX9-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, s4
 ; GFX9-NEXT:    s_or_b32 s2, s2, s3
 ; GFX9-NEXT:    s_and_b32 s3, s11, 7
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT:    s_and_b32 s4, 0xffff, s8
 ; GFX9-NEXT:    s_lshl_b32 s3, s5, s3
+; GFX9-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX9-NEXT:    s_andn2_b32 s5, 7, s11
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX9-NEXT:    s_lshr_b32 s4, s8, 1
+; GFX9-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX9-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
@@ -1148,57 +1170,62 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_and_b32 s11, s2, 7
+; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_andn2_b32 s12, 7, s2
-; GFX10-NEXT:    s_and_b32 s11, 0xffff, s11
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX10-NEXT:    s_and_b32 s12, s2, 7
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX10-NEXT:    s_and_b32 s12, 0xffff, s12
+; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX10-NEXT:    s_and_b32 s2, s6, 0xff
+; GFX10-NEXT:    s_and_b32 s6, s9, 7
+; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX10-NEXT:    s_and_b32 s12, 0xffff, s12
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT:    s_and_b32 s9, 0xffff, s9
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s11
-; GFX10-NEXT:    s_lshr_b32 s1, s1, s12
-; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
+; GFX10-NEXT:    s_lshr_b32 s2, s2, s9
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_and_b32 s1, s9, 7
-; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_lshr_b32 s6, s6, 1
-; GFX10-NEXT:    s_and_b32 s9, 0xffff, s9
-; GFX10-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX10-NEXT:    s_lshr_b32 s3, s6, s9
-; GFX10-NEXT:    s_and_b32 s6, s10, 7
-; GFX10-NEXT:    s_or_b32 s1, s1, s3
-; GFX10-NEXT:    s_and_b32 s3, 0xffff, s6
-; GFX10-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
-; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX10-NEXT:    s_and_b32 s4, 0xffff, s6
+; GFX10-NEXT:    s_or_b32 s1, s3, s2
+; GFX10-NEXT:    s_and_b32 s2, s7, 0xff
 ; GFX10-NEXT:    s_andn2_b32 s6, 7, s10
-; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT:    s_and_b32 s3, s10, 7
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT:    s_and_b32 s7, s2, 7
-; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
-; GFX10-NEXT:    s_lshr_b32 s4, s4, s6
-; GFX10-NEXT:    s_and_b32 s6, 0xffff, s7
-; GFX10-NEXT:    s_lshr_b32 s7, s8, 1
 ; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT:    s_lshl_b32 s5, s5, s6
-; GFX10-NEXT:    s_lshr_b32 s2, s7, s2
-; GFX10-NEXT:    s_or_b32 s3, s3, s4
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT:    s_lshr_b32 s2, s2, s6
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s8
+; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX10-NEXT:    s_and_b32 s4, s11, 7
+; GFX10-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX10-NEXT:    s_andn2_b32 s7, 7, s11
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX10-NEXT:    s_lshr_b32 s5, s6, s7
+; GFX10-NEXT:    s_or_b32 s2, s3, s2
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_or_b32 s2, s5, s2
+; GFX10-NEXT:    s_or_b32 s3, s4, s5
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_lshl_b32 s1, s2, 16
+; GFX10-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1209,57 +1236,62 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX11-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX11-NEXT:    s_lshr_b32 s8, s1, 24
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    s_and_b32 s11, s2, 7
+; GFX11-NEXT:    s_lshr_b32 s9, s2, 8
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_and_not1_b32 s12, 7, s2
-; GFX11-NEXT:    s_and_b32 s11, 0xffff, s11
+; GFX11-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX11-NEXT:    s_lshr_b32 s11, s2, 24
+; GFX11-NEXT:    s_and_b32 s12, s2, 7
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX11-NEXT:    s_and_b32 s12, 0xffff, s12
+; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX11-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX11-NEXT:    s_and_b32 s2, s6, 0xff
+; GFX11-NEXT:    s_and_b32 s6, s9, 7
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT:    s_and_not1_b32 s9, 7, s9
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX11-NEXT:    s_and_b32 s12, 0xffff, s12
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT:    s_and_b32 s9, 0xffff, s9
 ; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX11-NEXT:    s_lshr_b32 s9, s2, 8
-; GFX11-NEXT:    s_lshl_b32 s0, s0, s11
-; GFX11-NEXT:    s_lshr_b32 s1, s1, s12
-; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX11-NEXT:    s_lshl_b32 s3, s3, s6
+; GFX11-NEXT:    s_lshr_b32 s2, s2, s9
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    s_and_b32 s1, s9, 7
-; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT:    s_and_not1_b32 s9, 7, s9
-; GFX11-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_lshr_b32 s6, s6, 1
-; GFX11-NEXT:    s_and_b32 s9, 0xffff, s9
-; GFX11-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX11-NEXT:    s_lshr_b32 s3, s6, s9
-; GFX11-NEXT:    s_and_b32 s6, s10, 7
-; GFX11-NEXT:    s_or_b32 s1, s1, s3
-; GFX11-NEXT:    s_and_b32 s3, 0xffff, s6
-; GFX11-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 24
-; GFX11-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX11-NEXT:    s_and_b32 s4, 0xffff, s6
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_and_b32 s2, s7, 0xff
 ; GFX11-NEXT:    s_and_not1_b32 s6, 7, s10
-; GFX11-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT:    s_and_b32 s3, s10, 7
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT:    s_and_b32 s7, s2, 7
-; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT:    s_lshr_b32 s4, s4, s6
-; GFX11-NEXT:    s_and_b32 s6, 0xffff, s7
-; GFX11-NEXT:    s_lshr_b32 s7, s8, 1
 ; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT:    s_lshl_b32 s5, s5, s6
-; GFX11-NEXT:    s_lshr_b32 s2, s7, s2
-; GFX11-NEXT:    s_or_b32 s3, s3, s4
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT:    s_lshr_b32 s2, s2, s6
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s8
+; GFX11-NEXT:    s_lshl_b32 s3, s4, s3
+; GFX11-NEXT:    s_and_b32 s4, s11, 7
+; GFX11-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX11-NEXT:    s_and_not1_b32 s7, 7, s11
+; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX11-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX11-NEXT:    s_lshr_b32 s5, s6, s7
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    s_or_b32 s2, s5, s2
+; GFX11-NEXT:    s_or_b32 s3, s4, s5
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s1, s2, 16
+; GFX11-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
@@ -3335,11 +3367,12 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX8-LABEL: s_fshl_i16:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s3, s2, 15
 ; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
@@ -3348,11 +3381,12 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX9-LABEL: s_fshl_i16:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s3, s2, 15
 ; GFX9-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, s2
@@ -3361,11 +3395,12 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX10-LABEL: s_fshl_i16:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    s_and_b32 s3, s2, 15
 ; GFX10-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
@@ -3374,11 +3409,12 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX11-LABEL: s_fshl_i16:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    s_and_b32 s3, s2, 15
 ; GFX11-NEXT:    s_and_not1_b32 s2, 15, s2
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, s3
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, s2
@@ -3782,10 +3818,11 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX8-LABEL: v_fshl_i16_vss:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_and_b32 s2, s1, 15
 ; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, s2, v0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
@@ -3794,10 +3831,11 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX9-LABEL: v_fshl_i16_vss:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    s_and_b32 s2, s1, 15
 ; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, s2, v0
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
@@ -3806,11 +3844,12 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX10-LABEL: v_fshl_i16_vss:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX10-NEXT:    s_and_b32 s2, s1, 15
 ; GFX10-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    v_lshlrev_b16 v0, s2, v0
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX10-NEXT:    v_lshlrev_b16 v0, s2, v0
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -3818,11 +3857,12 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX11-LABEL: v_fshl_i16_vss:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX11-NEXT:    s_and_b32 s2, s1, 15
 ; GFX11-NEXT:    s_and_not1_b32 s1, 15, s1
-; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT:    v_lshlrev_b16 v0, s2, v0
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX11-NEXT:    v_lshlrev_b16 v0, s2, v0
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, s1
@@ -3853,8 +3893,8 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_and_b32 s3, 0xffff, s4
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -3862,22 +3902,25 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-LABEL: s_fshl_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX8-NEXT:    s_and_b32 s6, s2, 15
 ; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s5, 15
-; GFX8-NEXT:    s_andn2_b32 s2, 15, s5
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
-; GFX8-NEXT:    s_lshr_b32 s3, s4, 1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s4
+; GFX8-NEXT:    s_andn2_b32 s2, 15, s5
+; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
@@ -4148,14 +4191,15 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v2, v2, s0
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, -1
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s3
 ; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX8-NEXT:    s_lshr_b32 s0, s3, 1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v2, v2, s2
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -4340,19 +4384,22 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-LABEL: v_fshl_v2i16_vss:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_and_b32 s4, s1, 15
 ; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, s4, v0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    s_and_b32 s0, s3, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s2
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
@@ -4451,42 +4498,46 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_bfe_u32 s3, s5, 0xf0001
 ; GFX6-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s4
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fshl_v3i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX8-NEXT:    s_and_b32 s9, s4, 15
 ; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_and_b32 s9, 0xffff, s9
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX8-NEXT:    s_and_b32 s9, 0xffff, s9
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s9
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s8, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s8
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
-; GFX8-NEXT:    s_lshr_b32 s6, s7, 1
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s7
+; GFX8-NEXT:    s_andn2_b32 s4, 15, s8
+; GFX8-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_and_b32 s4, s5, 15
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
@@ -4805,13 +4856,13 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_bfe_u32 s4, s7, 0xf0001
 ; GFX6-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
-; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s2
-; GFX6-NEXT:    s_and_b32 s2, 0xffff, s3
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -4819,41 +4870,47 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-LABEL: s_fshl_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    s_and_b32 s12, s4, 15
 ; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_and_b32 s12, 0xffff, s12
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX8-NEXT:    s_and_b32 s12, 0xffff, s12
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s12
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s10, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s10
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
-; GFX8-NEXT:    s_lshr_b32 s6, s8, 1
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s8
+; GFX8-NEXT:    s_andn2_b32 s4, 15, s10
+; GFX8-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
+; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_and_b32 s4, s5, 15
-; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
-; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s9
 ; GFX8-NEXT:    s_or_b32 s1, s1, s3
 ; GFX8-NEXT:    s_and_b32 s3, s11, 15
 ; GFX8-NEXT:    s_andn2_b32 s4, 15, s11
+; GFX8-NEXT:    s_lshr_b32 s5, s5, 1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT:    s_lshr_b32 s5, s9, 1
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s3, s7, s3
 ; GFX8-NEXT:    s_lshr_b32 s4, s5, s4
@@ -5831,9 +5888,11 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT:    s_lshl_b64 s[12:13], s[0:1], s8
 ; GFX6-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_and_b32 s11, s18, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], s[12:13], 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX6-NEXT:    s_and_b32 s9, s9, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    s_mov_b32 s10, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
@@ -5854,11 +5913,13 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
-; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_and_b32 s10, s15, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_and_b32 s8, s16, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -5878,9 +5939,11 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT:    s_lshl_b64 s[12:13], s[0:1], s8
 ; GFX8-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
-; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_and_b32 s11, s18, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX8-NEXT:    s_cselect_b64 s[12:13], s[12:13], 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX8-NEXT:    s_and_b32 s9, s9, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    s_mov_b32 s10, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
@@ -5901,11 +5964,13 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
-; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_and_b32 s10, s15, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_and_b32 s8, s16, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -5925,9 +5990,11 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT:    s_lshl_b64 s[12:13], s[0:1], s8
 ; GFX9-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
-; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_and_b32 s11, s18, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX9-NEXT:    s_cselect_b64 s[12:13], s[12:13], 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX9-NEXT:    s_and_b32 s9, s9, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    s_mov_b32 s10, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
@@ -5948,11 +6015,13 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
-; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_and_b32 s10, s15, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_and_b32 s8, s16, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -5961,21 +6030,23 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-LABEL: s_fshl_i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_and_b32 s9, s8, 0x7f
-; GFX10-NEXT:    s_mov_b32 s10, 0
-; GFX10-NEXT:    s_sub_i32 s11, s9, 64
-; GFX10-NEXT:    s_sub_i32 s12, 64, s9
+; GFX10-NEXT:    s_sub_i32 s16, s9, 64
+; GFX10-NEXT:    s_sub_i32 s10, 64, s9
 ; GFX10-NEXT:    s_cmp_lt_u32 s9, 64
-; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s9, 0
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[12:13], s[0:1], s12
-; GFX10-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
-; GFX10-NEXT:    s_lshl_b64 s[16:17], s[0:1], s8
-; GFX10-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
-; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
-; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX10-NEXT:    s_cselect_b64 s[14:15], s[16:17], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[2:3], s8
+; GFX10-NEXT:    s_and_b32 s17, s17, 1
+; GFX10-NEXT:    s_lshl_b64 s[14:15], s[0:1], s8
+; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s16
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_cselect_b64 s[12:13], s[14:15], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    s_mov_b32 s10, 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
@@ -5984,45 +6055,50 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT:    s_andn2_b32 s6, 0x7f, s8
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
 ; GFX10-NEXT:    s_not_b32 s10, s8
-; GFX10-NEXT:    s_sub_i32 s12, s6, 64
+; GFX10-NEXT:    s_sub_i32 s14, s6, 64
 ; GFX10-NEXT:    s_sub_i32 s8, 64, s6
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s10
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
 ; GFX10-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_and_b32 s8, s15, 1
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_and_b32 s6, s16, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[10:11], 0
-; GFX10-NEXT:    s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fshl_i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_and_b32 s9, s8, 0x7f
-; GFX11-NEXT:    s_mov_b32 s10, 0
-; GFX11-NEXT:    s_sub_i32 s11, s9, 64
-; GFX11-NEXT:    s_sub_i32 s12, 64, s9
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_sub_i32 s16, s9, 64
+; GFX11-NEXT:    s_sub_i32 s10, 64, s9
 ; GFX11-NEXT:    s_cmp_lt_u32 s9, 64
-; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s9, 0
 ; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[12:13], s[0:1], s12
-; GFX11-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
-; GFX11-NEXT:    s_lshl_b64 s[16:17], s[0:1], s8
-; GFX11-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
-; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX11-NEXT:    s_cselect_b64 s[14:15], s[16:17], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
+; GFX11-NEXT:    s_lshl_b64 s[12:13], s[2:3], s8
+; GFX11-NEXT:    s_and_b32 s17, s17, 1
+; GFX11-NEXT:    s_lshl_b64 s[14:15], s[0:1], s8
+; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s16
+; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX11-NEXT:    s_cselect_b64 s[12:13], s[14:15], 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT:    s_and_b32 s9, s9, 1
+; GFX11-NEXT:    s_mov_b32 s10, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
@@ -6031,24 +6107,27 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT:    s_and_not1_b32 s6, 0x7f, s8
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
 ; GFX11-NEXT:    s_not_b32 s10, s8
-; GFX11-NEXT:    s_sub_i32 s12, s6, 64
+; GFX11-NEXT:    s_sub_i32 s14, s6, 64
 ; GFX11-NEXT:    s_sub_i32 s8, 64, s6
 ; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s15, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s10
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
 ; GFX11-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX11-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    s_and_b32 s8, s15, 1
+; GFX11-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX11-NEXT:    s_and_b32 s6, s16, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[10:11], 0
-; GFX11-NEXT:    s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX11-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6575,9 +6654,11 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    s_lshl_b64 s[6:7], s[0:1], s4
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX6-NEXT:    s_and_b32 s10, s13, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT:    s_and_b32 s5, s5, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 1
@@ -6627,9 +6708,11 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    s_lshl_b64 s[6:7], s[0:1], s4
 ; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX8-NEXT:    s_and_b32 s10, s13, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
@@ -6679,9 +6762,11 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    s_lshl_b64 s[6:7], s[0:1], s4
 ; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX9-NEXT:    s_and_b32 s10, s13, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
 ; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
@@ -6730,12 +6815,14 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s4
 ; GFX10-NEXT:    s_lshl_b64 s[10:11], s[0:1], s4
-; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_and_b32 s12, s13, 1
+; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_andn2_b32 s0, 0x7f, s4
@@ -6781,16 +6868,18 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s4
 ; GFX11-NEXT:    s_lshl_b64 s[10:11], s[0:1], s4
-; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    s_and_b32 s12, s13, 1
+; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX11-NEXT:    s_and_b32 s5, s5, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_and_not1_b32 s0, 0x7f, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_sub_i32 s1, 64, s0
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s0, v[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
@@ -6867,11 +6956,13 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_and_b32 s8, s11, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX6-NEXT:    s_and_b32 s6, s12, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
@@ -6922,11 +7013,13 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_and_b32 s8, s11, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
-; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_and_b32 s6, s12, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
@@ -6977,11 +7070,13 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_and_b32 s8, s11, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_and_b32 s6, s12, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
@@ -7025,20 +7120,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[0:1], s8
 ; GFX10-NEXT:    s_lshl_b64 s[6:7], s[2:3], s7
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[2:3], s8
 ; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT:    s_and_b32 s6, s11, 1
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_and_b32 s4, s12, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[8:9], 0
 ; GFX10-NEXT:    v_or_b32_e32 v1, s1, v7
@@ -7079,19 +7176,22 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[0:1], s8
 ; GFX11-NEXT:    s_lshl_b64 s[6:7], s[2:3], s7
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[2:3], s8
 ; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX11-NEXT:    s_and_b32 s6, s11, 1
 ; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX11-NEXT:    s_and_b32 s4, s12, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[8:9], 0
 ; GFX11-NEXT:    v_or_b32_e32 v1, s1, v7
@@ -7238,9 +7338,11 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[22:23], s[0:1], s16
 ; GFX6-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_and_b32 s19, s28, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX6-NEXT:    s_cselect_b64 s[22:23], s[22:23], 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX6-NEXT:    s_and_b32 s17, s17, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX6-NEXT:    s_mov_b32 s18, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
@@ -7261,11 +7363,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[24:25], s[8:9], s21
 ; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s19
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX6-NEXT:    s_and_b32 s19, s26, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_and_b32 s16, s27, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX6-NEXT:    s_and_b32 s8, s20, 0x7f
@@ -7281,10 +7385,12 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[4:5], s20
 ; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s19
-; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX6-NEXT:    s_and_b32 s16, s21, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX6-NEXT:    s_and_b32 s10, s22, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX6-NEXT:    s_lshl_b32 s19, s14, 31
@@ -7303,11 +7409,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[10:11], s16
 ; GFX6-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s18
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_and_b32 s16, s19, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[14:15], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX6-NEXT:    s_and_b32 s14, s20, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[12:13], 0
 ; GFX6-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
 ; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
@@ -7327,9 +7435,11 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[22:23], s[0:1], s16
 ; GFX8-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_and_b32 s19, s28, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX8-NEXT:    s_cselect_b64 s[22:23], s[22:23], 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX8-NEXT:    s_and_b32 s17, s17, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    s_mov_b32 s18, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
@@ -7350,11 +7460,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[24:25], s[8:9], s21
 ; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s19
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX8-NEXT:    s_and_b32 s19, s26, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_and_b32 s16, s27, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX8-NEXT:    s_and_b32 s8, s20, 0x7f
@@ -7370,10 +7482,12 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[4:5], s20
 ; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX8-NEXT:    s_lshl_b64 s[4:5], s[4:5], s19
-; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX8-NEXT:    s_and_b32 s16, s21, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX8-NEXT:    s_and_b32 s10, s22, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX8-NEXT:    s_lshl_b32 s19, s14, 31
@@ -7392,11 +7506,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[10:11], s16
 ; GFX8-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s18
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_and_b32 s16, s19, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[14:15], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX8-NEXT:    s_and_b32 s14, s20, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[12:13], 0
 ; GFX8-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
 ; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
@@ -7416,9 +7532,11 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[22:23], s[0:1], s16
 ; GFX9-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_and_b32 s19, s28, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX9-NEXT:    s_cselect_b64 s[22:23], s[22:23], 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX9-NEXT:    s_and_b32 s17, s17, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    s_mov_b32 s18, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
@@ -7439,11 +7557,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[24:25], s[8:9], s21
 ; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s19
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX9-NEXT:    s_and_b32 s19, s26, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_and_b32 s16, s27, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX9-NEXT:    s_and_b32 s8, s20, 0x7f
@@ -7459,10 +7579,12 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[4:5], s20
 ; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], s19
-; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX9-NEXT:    s_and_b32 s16, s21, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX9-NEXT:    s_and_b32 s10, s22, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX9-NEXT:    s_lshl_b32 s19, s14, 31
@@ -7481,11 +7603,13 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[10:11], s16
 ; GFX9-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
 ; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s18
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_and_b32 s16, s19, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[14:15], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX9-NEXT:    s_and_b32 s14, s20, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[12:13], 0
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
 ; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
@@ -7494,21 +7618,23 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-LABEL: s_fshl_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_and_b32 s17, s16, 0x7f
-; GFX10-NEXT:    s_mov_b32 s18, 0
-; GFX10-NEXT:    s_sub_i32 s19, s17, 64
-; GFX10-NEXT:    s_sub_i32 s21, 64, s17
+; GFX10-NEXT:    s_sub_i32 s21, s17, 64
+; GFX10-NEXT:    s_sub_i32 s18, 64, s17
 ; GFX10-NEXT:    s_cmp_lt_u32 s17, 64
-; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s17, 0
 ; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[22:23], s[0:1], s21
-; GFX10-NEXT:    s_lshl_b64 s[24:25], s[2:3], s16
-; GFX10-NEXT:    s_lshl_b64 s[26:27], s[0:1], s16
-; GFX10-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX10-NEXT:    s_cselect_b64 s[24:25], s[26:27], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT:    s_lshr_b64 s[18:19], s[0:1], s18
+; GFX10-NEXT:    s_lshl_b64 s[22:23], s[2:3], s16
+; GFX10-NEXT:    s_and_b32 s26, s26, 1
+; GFX10-NEXT:    s_lshl_b64 s[24:25], s[0:1], s16
+; GFX10-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s21
+; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX10-NEXT:    s_cselect_b64 s[22:23], s[24:25], 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
+; GFX10-NEXT:    s_and_b32 s17, s17, 1
+; GFX10-NEXT:    s_mov_b32 s18, 0
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
@@ -7525,17 +7651,19 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s19
 ; GFX10-NEXT:    s_lshl_b64 s[16:17], s[8:9], s16
-; GFX10-NEXT:    s_lshr_b64 s[22:23], s[8:9], s19
+; GFX10-NEXT:    s_lshr_b64 s[24:25], s[8:9], s19
 ; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX10-NEXT:    s_and_b32 s16, s26, 1
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[8:9], s21
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_and_b32 s10, s27, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[22:23], 0
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[24:25], 0
 ; GFX10-NEXT:    s_and_b32 s10, s20, 0x7f
-; GFX10-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[22:23], s[0:1]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    s_sub_i32 s19, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s8, 64, s10
@@ -7545,13 +7673,15 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
 ; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s20
+; GFX10-NEXT:    s_and_b32 s21, s21, 1
 ; GFX10-NEXT:    s_lshl_b64 s[16:17], s[4:5], s20
 ; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], s19
 ; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX10-NEXT:    s_cselect_b64 s[10:11], s[16:17], 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX10-NEXT:    s_and_b32 s8, s22, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX10-NEXT:    s_lshl_b32 s19, s14, 31
@@ -7569,12 +7699,14 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
 ; GFX10-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-NEXT:    s_and_b32 s14, s19, 1
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_and_b32 s12, s20, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
 ; GFX10-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
@@ -7583,21 +7715,24 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-LABEL: s_fshl_v2i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_and_b32 s17, s16, 0x7f
-; GFX11-NEXT:    s_mov_b32 s18, 0
-; GFX11-NEXT:    s_sub_i32 s19, s17, 64
-; GFX11-NEXT:    s_sub_i32 s21, 64, s17
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_sub_i32 s21, s17, 64
+; GFX11-NEXT:    s_sub_i32 s18, 64, s17
 ; GFX11-NEXT:    s_cmp_lt_u32 s17, 64
-; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s17, 0
 ; GFX11-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[22:23], s[0:1], s21
-; GFX11-NEXT:    s_lshl_b64 s[24:25], s[2:3], s16
-; GFX11-NEXT:    s_lshl_b64 s[26:27], s[0:1], s16
-; GFX11-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX11-NEXT:    s_cselect_b64 s[24:25], s[26:27], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT:    s_lshr_b64 s[18:19], s[0:1], s18
+; GFX11-NEXT:    s_lshl_b64 s[22:23], s[2:3], s16
+; GFX11-NEXT:    s_and_b32 s26, s26, 1
+; GFX11-NEXT:    s_lshl_b64 s[24:25], s[0:1], s16
+; GFX11-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s21
+; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX11-NEXT:    s_cselect_b64 s[22:23], s[24:25], 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
+; GFX11-NEXT:    s_and_b32 s17, s17, 1
+; GFX11-NEXT:    s_mov_b32 s18, 0
 ; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
@@ -7614,17 +7749,20 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s19
 ; GFX11-NEXT:    s_lshl_b64 s[16:17], s[8:9], s16
-; GFX11-NEXT:    s_lshr_b64 s[22:23], s[8:9], s19
+; GFX11-NEXT:    s_lshr_b64 s[24:25], s[8:9], s19
 ; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX11-NEXT:    s_and_b32 s16, s26, 1
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[8:9], s21
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX11-NEXT:    s_and_b32 s10, s27, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[22:23], 0
+; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX11-NEXT:    s_cselect_b64 s[8:9], s[24:25], 0
 ; GFX11-NEXT:    s_and_b32 s10, s20, 0x7f
-; GFX11-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX11-NEXT:    s_or_b64 s[0:1], s[22:23], s[0:1]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX11-NEXT:    s_sub_i32 s19, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s8, 64, s10
@@ -7634,13 +7772,16 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
 ; GFX11-NEXT:    s_lshl_b64 s[10:11], s[6:7], s20
+; GFX11-NEXT:    s_and_b32 s21, s21, 1
 ; GFX11-NEXT:    s_lshl_b64 s[16:17], s[4:5], s20
 ; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], s19
 ; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX11-NEXT:    s_cselect_b64 s[10:11], s[16:17], 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX11-NEXT:    s_and_b32 s8, s22, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX11-NEXT:    s_lshl_b32 s19, s14, 31
@@ -7658,12 +7799,15 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
 ; GFX11-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
 ; GFX11-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX11-NEXT:    s_and_b32 s14, s19, 1
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX11-NEXT:    s_and_b32 s12, s20, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
 ; GFX11-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 58304d2072d7f6..dd85d370227e7d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -1066,10 +1066,11 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, s4
 ; GFX8-NEXT:    s_and_b32 s4, s11, 7
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s8
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX8-NEXT:    s_lshr_b32 s4, s8, s4
+; GFX8-NEXT:    s_lshr_b32 s4, s5, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
@@ -1127,10 +1128,11 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
 ; GFX9-NEXT:    s_and_b32 s4, s11, 7
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s5, 0xffff, s8
 ; GFX9-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX9-NEXT:    s_lshr_b32 s4, s8, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s5, s4
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    s_and_b32 s1, s2, 0xff
 ; GFX9-NEXT:    s_or_b32 s3, s3, s4
@@ -1181,13 +1183,14 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX10-NEXT:    s_lshl_b32 s2, s3, s2
 ; GFX10-NEXT:    s_lshr_b32 s3, s4, s6
-; GFX10-NEXT:    s_lshl_b32 s4, s5, 1
-; GFX10-NEXT:    s_andn2_b32 s5, 7, s11
+; GFX10-NEXT:    s_andn2_b32 s4, 7, s11
 ; GFX10-NEXT:    s_and_b32 s6, s11, 7
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT:    s_and_b32 s7, 0xffff, s8
 ; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX10-NEXT:    s_lshr_b32 s5, s8, s6
+; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX10-NEXT:    s_lshr_b32 s5, s7, s6
 ; GFX10-NEXT:    s_or_b32 s2, s2, s3
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_or_b32 s3, s4, s5
@@ -1242,13 +1245,14 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX11-NEXT:    s_lshl_b32 s2, s3, s2
 ; GFX11-NEXT:    s_lshr_b32 s3, s4, s6
-; GFX11-NEXT:    s_lshl_b32 s4, s5, 1
-; GFX11-NEXT:    s_and_not1_b32 s5, 7, s11
+; GFX11-NEXT:    s_and_not1_b32 s4, 7, s11
 ; GFX11-NEXT:    s_and_b32 s6, s11, 7
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT:    s_and_b32 s7, 0xffff, s8
 ; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT:    s_lshl_b32 s4, s4, s5
-; GFX11-NEXT:    s_lshr_b32 s5, s8, s6
+; GFX11-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX11-NEXT:    s_lshr_b32 s5, s7, s6
 ; GFX11-NEXT:    s_or_b32 s2, s2, s3
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX11-NEXT:    s_or_b32 s3, s4, s5
@@ -3610,8 +3614,8 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_and_b32 s3, 0xffff, s4
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, s3
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -3624,17 +3628,19 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_lshr_b32 s5, s5, 15
 ; GFX8-NEXT:    s_or_b32 s0, s0, s5
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX8-NEXT:    s_lshr_b32 s5, s4, 15
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX8-NEXT:    s_lshr_b32 s5, s5, 15
 ; GFX8-NEXT:    s_xor_b32 s2, s2, -1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_or_b32 s3, s3, s5
 ; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX8-NEXT:    s_and_b32 s6, s2, 15
 ; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
-; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
@@ -3642,10 +3648,11 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_and_b32 s1, s5, 15
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 1
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT:    s_andn2_b32 s2, 15, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s4
+; GFX8-NEXT:    s_andn2_b32 s2, 15, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
@@ -3959,12 +3966,13 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v2, v2, s0
-; GFX8-NEXT:    s_lshr_b32 s4, s3, 15
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s3
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX8-NEXT:    s_lshr_b32 s4, s4, 15
 ; GFX8-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s3
@@ -4201,28 +4209,31 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 1, v0
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 15
-; GFX8-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    s_lshr_b32 s3, s2, 15
+; GFX8-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_lshr_b32 s3, s3, 15
 ; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    v_or_b32_e32 v0, s3, v0
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_and_b32 s4, s1, 15
 ; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, s4, v1
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    s_and_b32 s0, s3, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, s0, v0
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s2
+; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX8-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -4331,12 +4342,12 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX6-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s5
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s4
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fshr_v3i16:
@@ -4347,17 +4358,19 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s8, 15
 ; GFX8-NEXT:    s_or_b32 s0, s0, s8
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 1
-; GFX8-NEXT:    s_lshr_b32 s8, s7, 15
+; GFX8-NEXT:    s_and_b32 s8, 0xffff, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 1
+; GFX8-NEXT:    s_lshr_b32 s8, s8, 15
 ; GFX8-NEXT:    s_xor_b32 s4, s4, -1
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_or_b32 s6, s6, s8
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX8-NEXT:    s_and_b32 s9, s4, 15
 ; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_and_b32 s9, 0xffff, s9
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX8-NEXT:    s_and_b32 s9, 0xffff, s9
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s9
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
@@ -4365,24 +4378,26 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_and_b32 s2, s8, 15
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, 1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s7
+; GFX8-NEXT:    s_andn2_b32 s4, 15, s8
 ; GFX8-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    s_lshr_b32 s4, s4, 15
-; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_xor_b32 s4, s5, -1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s5, s4, 15
 ; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
@@ -4737,8 +4752,8 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_and_b32 s5, 0xffff, s8
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
 ; GFX6-NEXT:    s_or_b32 s1, s1, s4
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_lshl_b32 s1, s2, 1
@@ -4769,8 +4784,8 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX6-NEXT:    s_and_b32 s4, 0xffff, s5
 ; GFX6-NEXT:    s_lshr_b32 s3, s3, s4
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -4783,17 +4798,19 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s8, 15
 ; GFX8-NEXT:    s_or_b32 s0, s0, s8
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 1
-; GFX8-NEXT:    s_lshr_b32 s8, s7, 15
+; GFX8-NEXT:    s_and_b32 s8, 0xffff, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 1
+; GFX8-NEXT:    s_lshr_b32 s8, s8, 15
 ; GFX8-NEXT:    s_xor_b32 s4, s4, -1
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_or_b32 s6, s6, s8
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX8-NEXT:    s_and_b32 s9, s4, 15
 ; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
-; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_and_b32 s9, 0xffff, s9
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX8-NEXT:    s_and_b32 s9, 0xffff, s9
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s9
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
@@ -4801,10 +4818,11 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_and_b32 s2, s8, 15
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, 1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s8
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s7
+; GFX8-NEXT:    s_andn2_b32 s4, 15, s8
 ; GFX8-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
@@ -4818,17 +4836,19 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    s_lshr_b32 s6, s6, 15
 ; GFX8-NEXT:    s_or_b32 s1, s1, s6
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
-; GFX8-NEXT:    s_lshr_b32 s6, s4, 15
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX8-NEXT:    s_lshr_b32 s6, s6, 15
 ; GFX8-NEXT:    s_xor_b32 s5, s5, -1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s5, 16
 ; GFX8-NEXT:    s_and_b32 s7, s5, 15
 ; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
-; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s7
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s5
@@ -4836,10 +4856,11 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_and_b32 s3, s6, 15
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT:    s_andn2_b32 s5, 15, s6
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s4
+; GFX8-NEXT:    s_andn2_b32 s5, 15, s6
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
 ; GFX8-NEXT:    s_or_b32 s2, s2, s3
@@ -5857,10 +5878,12 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[10:11], s9
 ; GFX6-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[10:11], s16
-; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_and_b32 s9, s17, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[12:13], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_and_b32 s9, s18, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[0:1], s[10:11]
 ; GFX6-NEXT:    s_and_b32 s0, s8, 0x7f
 ; GFX6-NEXT:    s_sub_i32 s14, s0, 64
@@ -5874,11 +5897,13 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_and_b32 s12, s15, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[8:9], s[6:7]
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_and_b32 s8, s16, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[10:11], s[6:7]
@@ -5904,10 +5929,12 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[10:11], s9
 ; GFX8-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[10:11], s16
-; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX8-NEXT:    s_and_b32 s9, s17, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[12:13], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_and_b32 s9, s18, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[0:1], s[10:11]
 ; GFX8-NEXT:    s_and_b32 s0, s8, 0x7f
 ; GFX8-NEXT:    s_sub_i32 s14, s0, 64
@@ -5921,11 +5948,13 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
 ; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_and_b32 s12, s15, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[8:9], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_and_b32 s8, s16, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[10:11], s[6:7]
@@ -5951,10 +5980,12 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[10:11], s9
 ; GFX9-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[10:11], s16
-; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX9-NEXT:    s_and_b32 s9, s17, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[12:13], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_and_b32 s9, s18, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[0:1], s[10:11]
 ; GFX9-NEXT:    s_and_b32 s0, s8, 0x7f
 ; GFX9-NEXT:    s_sub_i32 s14, s0, 64
@@ -5968,11 +5999,13 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
 ; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
 ; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_and_b32 s12, s15, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[8:9], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_and_b32 s8, s16, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[10:11], s[6:7]
@@ -5995,12 +6028,14 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
 ; GFX10-NEXT:    s_lshl_b64 s[12:13], s[2:3], s14
+; GFX10-NEXT:    s_and_b32 s17, s17, 1
 ; GFX10-NEXT:    s_lshl_b64 s[14:15], s[0:1], s14
 ; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s16
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_cselect_b64 s[12:13], s[14:15], 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_and_b32 s9, s9, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_and_b32 s0, s8, 0x7f
@@ -6014,12 +6049,14 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s9
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[6:7], s8
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT:    s_and_b32 s10, s15, 1
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_and_b32 s6, s16, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -6042,16 +6079,18 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s10
 ; GFX11-NEXT:    s_lshl_b64 s[12:13], s[2:3], s14
+; GFX11-NEXT:    s_and_b32 s17, s17, 1
 ; GFX11-NEXT:    s_lshl_b64 s[14:15], s[0:1], s14
 ; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s16
 ; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX11-NEXT:    s_cselect_b64 s[12:13], s[14:15], 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT:    s_and_b32 s9, s9, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_and_b32 s0, s8, 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_sub_i32 s14, s0, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s0
 ; GFX11-NEXT:    s_cmp_lt_u32 s0, 64
@@ -6062,12 +6101,15 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT:    s_lshl_b64 s[10:11], s[6:7], s9
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[6:7], s8
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX11-NEXT:    s_and_b32 s10, s15, 1
 ; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
-; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX11-NEXT:    s_and_b32 s6, s16, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
@@ -6606,10 +6648,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[6:7], s5
 ; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX6-NEXT:    s_lshl_b64 s[6:7], s[6:7], s12
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX6-NEXT:    s_and_b32 s5, s13, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[8:9], s[6:7]
-; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX6-NEXT:    s_and_b32 s5, s14, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[0:1], s[6:7]
 ; GFX6-NEXT:    s_and_b32 s0, s4, 0x7f
 ; GFX6-NEXT:    s_sub_i32 s1, s0, 64
@@ -6660,10 +6704,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[6:7], s5
 ; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], s12
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX8-NEXT:    s_and_b32 s5, s13, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[8:9], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX8-NEXT:    s_and_b32 s5, s14, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[0:1], s[6:7]
 ; GFX8-NEXT:    s_and_b32 s0, s4, 0x7f
 ; GFX8-NEXT:    s_sub_i32 s1, s0, 64
@@ -6714,10 +6760,12 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[6:7], s5
 ; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], s12
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX9-NEXT:    s_and_b32 s5, s13, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[8:9], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX9-NEXT:    s_and_b32 s5, s14, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[0:1], s[6:7]
 ; GFX9-NEXT:    s_and_b32 s0, s4, 0x7f
 ; GFX9-NEXT:    s_sub_i32 s1, s0, 64
@@ -6765,12 +6813,14 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s10
+; GFX10-NEXT:    s_and_b32 s13, s13, 1
 ; GFX10-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
 ; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_and_b32 s0, s4, 0x7f
@@ -6819,16 +6869,18 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s10
+; GFX11-NEXT:    s_and_b32 s13, s13, 1
 ; GFX11-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
 ; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX11-NEXT:    s_and_b32 s5, s5, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_and_b32 s0, s4, 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_sub_i32 s1, 64, s0
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s0, v[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
@@ -6903,11 +6955,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_and_b32 s8, s11, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX6-NEXT:    s_and_b32 s4, s12, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
@@ -6956,11 +7010,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_and_b32 s8, s11, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
-; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_and_b32 s4, s12, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
@@ -7009,11 +7065,13 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_and_b32 s8, s11, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_and_b32 s4, s12, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
@@ -7055,20 +7113,22 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT:    s_and_b32 s8, s11, 1
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_and_b32 s6, s12, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
 ; GFX10-NEXT:    v_or_b32_e32 v1, s1, v7
@@ -7107,19 +7167,22 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
 ; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
 ; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX11-NEXT:    s_and_b32 s8, s11, 1
 ; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX11-NEXT:    s_and_b32 s6, s12, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
 ; GFX11-NEXT:    v_or_b32_e32 v1, s1, v7
@@ -7277,10 +7340,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[18:19], s17
 ; GFX6-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX6-NEXT:    s_lshl_b64 s[18:19], s[18:19], s21
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_and_b32 s17, s28, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX6-NEXT:    s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX6-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX6-NEXT:    s_and_b32 s17, s29, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX6-NEXT:    s_cselect_b64 s[18:19], s[0:1], s[18:19]
 ; GFX6-NEXT:    s_and_b32 s0, s16, 0x7f
 ; GFX6-NEXT:    s_sub_i32 s21, s0, 64
@@ -7294,11 +7359,13 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[24:25], s[10:11], s22
 ; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
 ; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s21
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX6-NEXT:    s_and_b32 s21, s26, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_and_b32 s16, s27, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX6-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX6-NEXT:    s_lshr_b32 s22, s5, 31
@@ -7319,10 +7386,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[8:9], s18
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_and_b32 s16, s19, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX6-NEXT:    s_and_b32 s10, s21, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[4:5], s[8:9]
 ; GFX6-NEXT:    s_and_b32 s4, s20, 0x7f
 ; GFX6-NEXT:    s_sub_i32 s18, s4, 64
@@ -7336,11 +7405,13 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[14:15], s20
 ; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX6-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_and_b32 s16, s19, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[14:15]
-; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX6-NEXT:    s_and_b32 s14, s21, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], s[12:13], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
 ; GFX6-NEXT:    s_or_b64 s[4:5], s[6:7], s[10:11]
 ; GFX6-NEXT:    s_or_b64 s[6:7], s[8:9], s[12:13]
@@ -7366,10 +7437,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[18:19], s17
 ; GFX8-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX8-NEXT:    s_lshl_b64 s[18:19], s[18:19], s21
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_and_b32 s17, s28, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX8-NEXT:    s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX8-NEXT:    s_and_b32 s17, s29, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    s_cselect_b64 s[18:19], s[0:1], s[18:19]
 ; GFX8-NEXT:    s_and_b32 s0, s16, 0x7f
 ; GFX8-NEXT:    s_sub_i32 s21, s0, 64
@@ -7383,11 +7456,13 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[24:25], s[10:11], s22
 ; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
 ; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s21
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX8-NEXT:    s_and_b32 s21, s26, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_and_b32 s16, s27, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX8-NEXT:    s_lshr_b32 s22, s5, 31
@@ -7408,10 +7483,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[8:9], s18
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_and_b32 s16, s19, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX8-NEXT:    s_and_b32 s10, s21, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[4:5], s[8:9]
 ; GFX8-NEXT:    s_and_b32 s4, s20, 0x7f
 ; GFX8-NEXT:    s_sub_i32 s18, s4, 64
@@ -7425,11 +7502,13 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[14:15], s20
 ; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX8-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_and_b32 s16, s19, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[14:15]
-; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX8-NEXT:    s_and_b32 s14, s21, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX8-NEXT:    s_cselect_b64 s[10:11], s[12:13], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
 ; GFX8-NEXT:    s_or_b64 s[4:5], s[6:7], s[10:11]
 ; GFX8-NEXT:    s_or_b64 s[6:7], s[8:9], s[12:13]
@@ -7455,10 +7534,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[18:19], s17
 ; GFX9-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
 ; GFX9-NEXT:    s_lshl_b64 s[18:19], s[18:19], s21
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_and_b32 s17, s28, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
 ; GFX9-NEXT:    s_cselect_b64 s[18:19], s[24:25], s[18:19]
-; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX9-NEXT:    s_and_b32 s17, s29, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    s_cselect_b64 s[18:19], s[0:1], s[18:19]
 ; GFX9-NEXT:    s_and_b32 s0, s16, 0x7f
 ; GFX9-NEXT:    s_sub_i32 s21, s0, 64
@@ -7472,11 +7553,13 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[24:25], s[10:11], s22
 ; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
 ; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s21
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX9-NEXT:    s_and_b32 s21, s26, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[16:17], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_and_b32 s16, s27, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
 ; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX9-NEXT:    s_lshr_b32 s22, s5, 31
@@ -7497,10 +7580,12 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[8:9], s18
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_and_b32 s16, s19, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX9-NEXT:    s_and_b32 s10, s21, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[4:5], s[8:9]
 ; GFX9-NEXT:    s_and_b32 s4, s20, 0x7f
 ; GFX9-NEXT:    s_sub_i32 s18, s4, 64
@@ -7514,11 +7599,13 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[14:15], s20
 ; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
 ; GFX9-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_and_b32 s16, s19, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[14:15]
-; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX9-NEXT:    s_and_b32 s14, s21, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX9-NEXT:    s_cselect_b64 s[10:11], s[12:13], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[10:11]
 ; GFX9-NEXT:    s_or_b64 s[6:7], s[8:9], s[12:13]
@@ -7542,11 +7629,13 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_lshr_b64 s[22:23], s[0:1], s22
 ; GFX10-NEXT:    s_lshl_b64 s[24:25], s[2:3], s18
 ; GFX10-NEXT:    s_lshl_b64 s[26:27], s[0:1], s18
+; GFX10-NEXT:    s_and_b32 s18, s28, 1
 ; GFX10-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s21
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_cselect_b64 s[24:25], s[26:27], 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT:    s_and_b32 s17, s17, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_and_b32 s0, s16, 0x7f
@@ -7558,12 +7647,14 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], s16
 ; GFX10-NEXT:    s_lshl_b64 s[22:23], s[10:11], s17
+; GFX10-NEXT:    s_and_b32 s21, s21, 1
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[10:11], s16
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX10-NEXT:    s_lshr_b64 s[10:11], s[10:11], s18
 ; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX10-NEXT:    s_and_b32 s10, s26, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
@@ -7583,13 +7674,15 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[4:5], s9
 ; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s16
+; GFX10-NEXT:    s_and_b32 s19, s19, 1
 ; GFX10-NEXT:    s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], s18
 ; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX10-NEXT:    s_cselect_b64 s[10:11], s[16:17], 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10-NEXT:    s_and_b32 s8, s21, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX10-NEXT:    s_and_b32 s4, s20, 0x7f
 ; GFX10-NEXT:    s_sub_i32 s18, s4, 64
@@ -7600,12 +7693,14 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[12:13], s20
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[14:15], s8
+; GFX10-NEXT:    s_and_b32 s19, s19, 1
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[14:15], s20
 ; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[14:15], s18
 ; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10-NEXT:    s_and_b32 s8, s21, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
 ; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX10-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
@@ -7631,15 +7726,17 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_lshr_b64 s[22:23], s[0:1], s22
 ; GFX11-NEXT:    s_lshl_b64 s[24:25], s[2:3], s18
 ; GFX11-NEXT:    s_lshl_b64 s[26:27], s[0:1], s18
+; GFX11-NEXT:    s_and_b32 s18, s28, 1
 ; GFX11-NEXT:    s_or_b64 s[22:23], s[22:23], s[24:25]
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s21
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX11-NEXT:    s_cselect_b64 s[24:25], s[26:27], 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT:    s_and_b32 s17, s17, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_and_b32 s0, s16, 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_sub_i32 s18, s0, 64
 ; GFX11-NEXT:    s_sub_i32 s17, 64, s0
 ; GFX11-NEXT:    s_cmp_lt_u32 s0, 64
@@ -7648,12 +7745,15 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[8:9], s16
 ; GFX11-NEXT:    s_lshl_b64 s[22:23], s[10:11], s17
+; GFX11-NEXT:    s_and_b32 s21, s21, 1
 ; GFX11-NEXT:    s_lshr_b64 s[16:17], s[10:11], s16
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX11-NEXT:    s_lshr_b64 s[10:11], s[10:11], s18
 ; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX11-NEXT:    s_and_b32 s10, s26, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
@@ -7673,16 +7773,18 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[4:5], s9
 ; GFX11-NEXT:    s_lshl_b64 s[10:11], s[6:7], s16
+; GFX11-NEXT:    s_and_b32 s19, s19, 1
 ; GFX11-NEXT:    s_lshl_b64 s[16:17], s[4:5], s16
 ; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], s18
 ; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX11-NEXT:    s_cselect_b64 s[10:11], s[16:17], 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX11-NEXT:    s_and_b32 s8, s21, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX11-NEXT:    s_and_b32 s4, s20, 0x7f
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_sub_i32 s18, s4, 64
 ; GFX11-NEXT:    s_sub_i32 s8, 64, s4
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
@@ -7691,12 +7793,15 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[12:13], s20
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[14:15], s8
+; GFX11-NEXT:    s_and_b32 s19, s19, 1
 ; GFX11-NEXT:    s_lshr_b64 s[16:17], s[14:15], s20
 ; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[14:15], s18
 ; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX11-NEXT:    s_and_b32 s8, s21, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
 ; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX11-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 9443b39dcdc033..adc2f208a625ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -14,13 +14,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
 ; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V4-NEXT:    s_mov_b32 s4, s0
+; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8V4-NEXT:    s_mov_b32 s5, s3
-; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX8V4-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX8V4-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
+; GFX8V4-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8V4-NEXT:    s_mov_b32 s6, s1
 ; GFX8V4-NEXT:    s_mov_b32 s7, s2
-; GFX8V4-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX8V4-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], s[6:7], 0
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s5
@@ -39,12 +45,18 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
 ; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V5-NEXT:    s_mov_b32 s4, s0
+; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V5-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8V5-NEXT:    s_mov_b32 s5, s2
-; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX8V5-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX8V5-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
-; GFX8V5-NEXT:    s_mov_b32 s2, s1
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V5-NEXT:    s_and_b32 s0, s0, 1
+; GFX8V5-NEXT:    s_mov_b32 s2, s1
+; GFX8V5-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], s[2:3], 0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
@@ -64,11 +76,17 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V4-NEXT:    s_mov_b32 s2, s0
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX9V4-NEXT:    s_mov_b32 s2, s0
+; GFX9V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V4-NEXT:    s_and_b32 s0, s0, 1
+; GFX9V4-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9V4-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9V4-NEXT:    s_mov_b32 s4, s1
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX9V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V4-NEXT:    s_and_b32 s0, s0, 1
+; GFX9V4-NEXT:    s_mov_b32 s4, s1
+; GFX9V4-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V4-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s3
@@ -88,11 +106,17 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9V5-NEXT:    s_mov_b32 s2, s0
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX9V5-NEXT:    s_mov_b32 s2, s0
+; GFX9V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V5-NEXT:    s_and_b32 s0, s0, 1
+; GFX9V5-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9V5-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9V5-NEXT:    s_mov_b32 s4, s1
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX9V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V5-NEXT:    s_and_b32 s0, s0, 1
+; GFX9V5-NEXT:    s_mov_b32 s4, s1
+; GFX9V5-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V5-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s3
@@ -120,6 +144,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
@@ -133,6 +158,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V5-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
@@ -145,6 +171,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX9V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V4-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V4-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
@@ -157,6 +184,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX9V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V5-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
@@ -176,6 +204,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V4-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
@@ -189,6 +218,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V5-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
@@ -201,6 +231,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX9V4-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V4-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V4-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
@@ -213,6 +244,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_eq_u32 s1, s3
 ; GFX9V5-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V5-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V5-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
index d3bc661f5940b6..b0a0d266f6636b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll
@@ -8,22 +8,25 @@
 define amdgpu_ps void @_amdgpu_ps_main(i1 %arg) {
 ; CHECK-LABEL: _amdgpu_ps_main:
 ; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s10, -1
 ; CHECK-NEXT:    s_mov_b32 s5, s4
 ; CHECK-NEXT:    s_mov_b32 s6, s4
 ; CHECK-NEXT:    s_mov_b32 s7, s4
-; CHECK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; CHECK-NEXT:    s_buffer_load_dword s1, s[4:7], 0x0
-; CHECK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; CHECK-NEXT:    s_mov_b32 s10, -1
 ; CHECK-NEXT:    s_mov_b32 s11, 0x31c16000
 ; CHECK-NEXT:    s_add_u32 s8, s8, s0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT:    s_buffer_load_dword s0, s[4:7], 0x0
+; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    s_addc_u32 s9, s9, 0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_cmp_ge_i32 s1, 0
+; CHECK-NEXT:    s_cmp_ge_i32 s0, 0
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_and_b32 s1, s1, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: ; %bb12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 41e915a4c1011b..1b7b05b847cfb4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -32,14 +32,41 @@ define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
 }
 
 define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
-; GFX-LABEL: abs_sgpr_i64:
-; GFX:       ; %bb.0:
-; GFX-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX-NEXT:    s_add_u32 s0, s0, s2
-; GFX-NEXT:    s_mov_b32 s3, s2
-; GFX-NEXT:    s_addc_u32 s1, s1, s2
-; GFX-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_i64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_ashr_i32 s2, s1, 31
+; GFX6-NEXT:    s_add_u32 s0, s0, s2
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX6-NEXT:    s_mov_b32 s3, s2
+; GFX6-NEXT:    s_addc_u32 s1, s1, s2
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
+; GFX8-NEXT:    s_add_u32 s0, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX8-NEXT:    s_mov_b32 s3, s2
+; GFX8-NEXT:    s_addc_u32 s1, s1, s2
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_ashr_i32 s2, s1, 31
+; GFX10-NEXT:    s_add_u32 s0, s0, s2
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_and_b32 s4, s3, 1
+; GFX10-NEXT:    s_mov_b32 s3, s2
+; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10-NEXT:    s_addc_u32 s1, s1, s2
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    ; return to shader part epilog
   %res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
   ret i64 %res
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 96cab200b61cdb..3e9f3e1a78a51b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -90,6 +90,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; CHECK-NEXT:    s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB7_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -114,6 +117,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -138,6 +144,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; CHECK-NEXT:    s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB9_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -162,6 +171,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB10_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -185,6 +197,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
 ; CHECK-NEXT:    s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB11_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -211,6 +226,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB12_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -234,6 +252,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
 ; CHECK-NEXT:    s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB13_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -260,6 +281,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB14_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -285,6 +309,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
 ; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
 ; CHECK-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB15_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -316,6 +343,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB16_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -343,6 +373,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
 ; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
 ; CHECK-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB17_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -374,6 +407,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -402,6 +438,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; CHECK-NEXT:    s_cmp_le_i32 s0, 22
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB19_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index a18f843440445c..c50a9142d3111a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -93,6 +93,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB7_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -117,6 +120,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
 ; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB8_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -141,6 +147,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB9_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -165,6 +174,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
 ; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB10_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -188,6 +200,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
 ; CHECK-NEXT:    s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB11_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -214,6 +229,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
 ; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB12_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -237,6 +255,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
 ; CHECK-NEXT:    s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB13_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -263,6 +284,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
 ; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB14_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -288,6 +312,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
 ; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
 ; CHECK-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB15_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -319,6 +346,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
 ; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB16_2
 ; CHECK-NEXT:  ; %bb.1: ; %true
 ; CHECK-NEXT:    s_mov_b32 s0, 42
@@ -346,6 +376,9 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
 ; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
 ; CHECK-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB17_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
@@ -377,6 +410,9 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
 ; CHECK-NEXT:    s_and_b32 s0, 1, s0
 ; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
 ; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB18_2
 ; CHECK-NEXT:  ; %bb.1: ; %false
 ; CHECK-NEXT:    s_mov_b32 s0, 33
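
A note for readers skimming the CHECK churn in the two ballot files: the recurring three-instruction sequence (s_cselect_b32 s0, 1, 0 materializes SCC into an SGPR, s_and_b32 s0, s0, 1 re-masks the low bit, s_cmp_lg_u32 s0, 0 regenerates SCC) is what a zero-extended i1 compare looks like once it is no longer folded straight into the conditional branch. A minimal sketch of the shape these tests reduce to, with a hypothetical function name; the constants 12, 42 and 33 match the CHECK lines above:

define amdgpu_cs i32 @ballot_branch_sketch(i32 %v) {
  %c = icmp ult i32 %v, 12
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
  %z = icmp eq i64 %ballot, 0
  br i1 %z, label %true, label %false
true:
  ret i32 42
false:
  ret i32 33
}
declare i64 @llvm.amdgcn.ballot.i64(i1)

On wave32 targets the same shape goes through llvm.amdgcn.ballot.i32, which is why the i32 file shows the identical triplet against vcc_lo.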
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 6415e185446f53..0aca5230ad0546 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -8,6 +8,9 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
 ; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GFX10-NEXT:  ; %bb.1: ; %mid
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
@@ -28,6 +31,9 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GFX11-NEXT:  ; %bb.1: ; %mid
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
index 06393857352b3a..66b1b3210c81ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll
@@ -7,6 +7,9 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) {
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %mid
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..f2ee2aaa29c43e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -69,15 +69,36 @@ define i8 @v_lshr_i8_7(i8 %value) {
 }
 
 define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
-; GCN-LABEL: s_lshr_i8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0xff
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i8:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i8 %value, %amount
@@ -85,14 +106,30 @@ define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
 }
 
 define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) {
-; GCN-LABEL: s_lshr_i8_7:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_bfe_u32 s0, s0, 0x10007
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i8_7:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x10007
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i8_7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 7
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i8_7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 7
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i8_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x10007
+; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, 7
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i8 %value, 7
   ret i8 %result
@@ -1761,11 +1798,13 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
 ; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GCN-NEXT:    s_lshr_b64 s[4:5], s[4:5], s10
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
+; GCN-NEXT:    s_and_b32 s7, s11, 1
+; GCN-NEXT:    s_cmp_lg_u32 s7, 0
 ; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
+; GCN-NEXT:    s_and_b32 s4, s12, 1
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
+; GCN-NEXT:    s_cmp_lg_u32 s7, 0
 ; GCN-NEXT:    s_cselect_b32 s2, s6, 0
 ; GCN-NEXT:    ; return to shader part epilog
 ;
@@ -1781,13 +1820,15 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[6:7], s[0:1], s3
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s2
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT:    s_and_b32 s3, s11, 1
 ; GFX10PLUS-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[4:5], s[4:5], s10
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT:    s_and_b32 s6, s12, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, 0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i65 %value, %amount
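
On the s_lshr_i8 changes just above: GFX8 and newer now mask the shift operands twice, first to the i8 value width (and with 0xff) and then to 16 bits (and with 0xffff), presumably the type the legalizer widened the shift to, where they previously folded everything into a single s_and_b32 or s_bfe_u32. A hand-written, hypothetical IR equivalent of the shape being selected, reduced for illustration:

define amdgpu_ps i8 @s_lshr_i8_sketch(i8 inreg %value, i8 inreg %amount) {
  %v = zext i8 %value to i16
  %a = zext i8 %amount to i16
  %r = lshr i16 %v, %a
  %t = trunc i16 %r to i8
  ret i8 %t
}

The second mask is redundant given the first; presumably a later combine or selection pattern could fold it away again.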
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 42f1bf84c04207..8b28ceb0f2544a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -729,12 +729,16 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX7-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s14, v0
 ; GFX7-NEXT:    s_add_u32 s9, s13, s9
-; GFX7-NEXT:    s_mul_i32 s8, s0, s4
 ; GFX7-NEXT:    s_addc_u32 s10, s14, s10
+; GFX7-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX7-NEXT:    s_mul_i32 s8, s0, s4
 ; GFX7-NEXT:    s_mul_i32 s0, s0, s7
+; GFX7-NEXT:    s_and_b32 s7, s13, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX7-NEXT:    s_addc_u32 s0, s11, s0
 ; GFX7-NEXT:    s_mul_i32 s1, s1, s6
-; GFX7-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX7-NEXT:    s_and_b32 s6, s12, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX7-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX7-NEXT:    s_mul_i32 s2, s2, s5
 ; GFX7-NEXT:    s_add_u32 s0, s2, s0
@@ -777,12 +781,16 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s14, v0
 ; GFX8-NEXT:    s_add_u32 s9, s13, s9
-; GFX8-NEXT:    s_mul_i32 s8, s0, s4
 ; GFX8-NEXT:    s_addc_u32 s10, s14, s10
+; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_mul_i32 s8, s0, s4
 ; GFX8-NEXT:    s_mul_i32 s0, s0, s7
+; GFX8-NEXT:    s_and_b32 s7, s13, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX8-NEXT:    s_addc_u32 s0, s11, s0
 ; GFX8-NEXT:    s_mul_i32 s1, s1, s6
-; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX8-NEXT:    s_and_b32 s6, s12, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX8-NEXT:    s_mul_i32 s2, s2, s5
 ; GFX8-NEXT:    s_add_u32 s0, s2, s0
@@ -814,12 +822,16 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX9-NEXT:    s_mul_hi_u32 s14, s1, s4
 ; GFX9-NEXT:    s_add_u32 s9, s13, s9
-; GFX9-NEXT:    s_mul_i32 s8, s0, s4
 ; GFX9-NEXT:    s_addc_u32 s10, s14, s10
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_mul_i32 s8, s0, s4
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s7
+; GFX9-NEXT:    s_and_b32 s7, s13, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT:    s_addc_u32 s0, s11, s0
 ; GFX9-NEXT:    s_mul_i32 s1, s1, s6
-; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_and_b32 s6, s12, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_addc_u32 s0, s0, s1
 ; GFX9-NEXT:    s_mul_i32 s2, s2, s5
 ; GFX9-NEXT:    s_add_u32 s0, s2, s0
@@ -851,19 +863,23 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX10PLUS-NEXT:    s_mul_hi_u32 s13, s1, s4
 ; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX10PLUS-NEXT:    s_add_u32 s8, s12, s8
-; GFX10PLUS-NEXT:    s_mul_i32 s12, s0, s7
-; GFX10PLUS-NEXT:    s_addc_u32 s7, s13, s9
-; GFX10PLUS-NEXT:    s_addc_u32 s9, s10, s12
+; GFX10PLUS-NEXT:    s_addc_u32 s9, s13, s9
+; GFX10PLUS-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10PLUS-NEXT:    s_mul_i32 s7, s0, s7
+; GFX10PLUS-NEXT:    s_and_b32 s12, s12, 1
 ; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s6
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s5
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s9, s1
+; GFX10PLUS-NEXT:    s_addc_u32 s7, s10, s7
+; GFX10PLUS-NEXT:    s_and_b32 s10, s11, 1
 ; GFX10PLUS-NEXT:    s_mul_i32 s3, s3, s4
-; GFX10PLUS-NEXT:    s_add_i32 s1, s1, s2
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s4
+; GFX10PLUS-NEXT:    s_addc_u32 s1, s7, s1
+; GFX10PLUS-NEXT:    s_add_i32 s1, s1, s2
+; GFX10PLUS-NEXT:    s_mov_b32 s2, s9
 ; GFX10PLUS-NEXT:    s_add_i32 s3, s1, s3
 ; GFX10PLUS-NEXT:    s_mov_b32 s1, s8
-; GFX10PLUS-NEXT:    s_mov_b32 s2, s7
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_mul_i128:
@@ -887,19 +903,24 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ; GFX12-NEXT:    s_mul_hi_u32 s13, s1, s4
 ; GFX12-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX12-NEXT:    s_add_co_u32 s8, s12, s8
-; GFX12-NEXT:    s_mul_i32 s12, s0, s7
-; GFX12-NEXT:    s_add_co_ci_u32 s7, s13, s9
-; GFX12-NEXT:    s_add_co_ci_u32 s9, s10, s12
+; GFX12-NEXT:    s_add_co_ci_u32 s9, s13, s9
+; GFX12-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX12-NEXT:    s_mul_i32 s7, s0, s7
+; GFX12-NEXT:    s_and_b32 s12, s12, 1
 ; GFX12-NEXT:    s_mul_i32 s1, s1, s6
-; GFX12-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX12-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX12-NEXT:    s_mul_i32 s2, s2, s5
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s9, s1
+; GFX12-NEXT:    s_add_co_ci_u32 s7, s10, s7
+; GFX12-NEXT:    s_and_b32 s10, s11, 1
 ; GFX12-NEXT:    s_mul_i32 s3, s3, s4
-; GFX12-NEXT:    s_add_co_i32 s1, s1, s2
+; GFX12-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s4
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s7, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s2
+; GFX12-NEXT:    s_mov_b32 s2, s9
 ; GFX12-NEXT:    s_add_co_i32 s3, s1, s3
 ; GFX12-NEXT:    s_mov_b32 s1, s8
-; GFX12-NEXT:    s_mov_b32 s2, s7
 ; GFX12-NEXT:    ; return to shader part epilog
   %result = mul i128 %num, %den
   %cast = bitcast i128 %result to <4 x i32>
@@ -1080,7 +1101,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX7-NEXT:    v_mul_hi_u32 v3, s16, v1
 ; GFX7-NEXT:    s_mul_i32 s18, s1, s8
-; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX7-NEXT:    s_add_u32 s18, s18, s17
 ; GFX7-NEXT:    s_addc_u32 s17, s23, s22
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s11
@@ -1091,33 +1112,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_mul_i32 s24, s1, s11
 ; GFX7-NEXT:    v_readfirstlane_b32 s28, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-NEXT:    v_readfirstlane_b32 s27, v5
+; GFX7-NEXT:    v_readfirstlane_b32 s25, v5
 ; GFX7-NEXT:    v_mul_hi_u32 v5, v3, s9
-; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX7-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX7-NEXT:    s_add_u32 s24, s24, s22
-; GFX7-NEXT:    s_addc_u32 s23, s27, s23
+; GFX7-NEXT:    s_addc_u32 s23, s25, s23
 ; GFX7-NEXT:    v_readfirstlane_b32 s29, v5
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX7-NEXT:    v_mul_hi_u32 v6, v5, s8
-; GFX7-NEXT:    s_mul_i32 s27, s2, s10
+; GFX7-NEXT:    s_mul_i32 s25, s2, s10
 ; GFX7-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX7-NEXT:    s_add_u32 s24, s27, s24
+; GFX7-NEXT:    s_add_u32 s24, s25, s24
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s10
-; GFX7-NEXT:    s_addc_u32 s27, s28, s23
+; GFX7-NEXT:    s_addc_u32 s25, s28, s23
 ; GFX7-NEXT:    s_mul_i32 s28, s3, s9
 ; GFX7-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX7-NEXT:    s_add_u32 s28, s28, s24
 ; GFX7-NEXT:    v_readfirstlane_b32 s30, v6
 ; GFX7-NEXT:    v_mul_hi_u32 v6, s16, v4
-; GFX7-NEXT:    s_addc_u32 s27, s29, s27
+; GFX7-NEXT:    s_addc_u32 s25, s29, s25
 ; GFX7-NEXT:    s_mul_i32 s29, s4, s8
 ; GFX7-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX7-NEXT:    s_add_u32 s28, s29, s28
 ; GFX7-NEXT:    v_readfirstlane_b32 s33, v0
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v2, s9
-; GFX7-NEXT:    s_addc_u32 s27, s30, s27
+; GFX7-NEXT:    s_addc_u32 s29, s30, s25
 ; GFX7-NEXT:    s_mul_i32 s30, s16, s11
-; GFX7-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s31, v6
 ; GFX7-NEXT:    s_add_u32 s19, s30, s19
 ; GFX7-NEXT:    s_addc_u32 s28, s31, s28
@@ -1135,125 +1156,149 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX7-NEXT:    s_cselect_b32 s33, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX7-NEXT:    s_add_u32 s19, s34, s19
-; GFX7-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX7-NEXT:    s_addc_u32 s28, s35, s28
-; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
 ; GFX7-NEXT:    s_cselect_b32 s34, 1, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX7-NEXT:    s_addc_u32 s19, s25, s19
+; GFX7-NEXT:    s_and_b32 s26, s26, 1
+; GFX7-NEXT:    s_and_b32 s27, s27, 1
+; GFX7-NEXT:    v_mov_b32_e32 v0, s14
+; GFX7-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
+; GFX7-NEXT:    s_addc_u32 s19, s26, s19
+; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX7-NEXT:    s_and_b32 s20, s20, 1
+; GFX7-NEXT:    s_and_b32 s21, s21, 1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s13
-; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX7-NEXT:    v_mul_hi_u32 v6, s1, v2
 ; GFX7-NEXT:    s_addc_u32 s20, s20, 0
-; GFX7-NEXT:    v_readfirstlane_b32 s26, v0
+; GFX7-NEXT:    s_and_b32 s21, s26, 1
+; GFX7-NEXT:    v_readfirstlane_b32 s27, v0
 ; GFX7-NEXT:    v_mul_hi_u32 v0, s2, v1
-; GFX7-NEXT:    s_cmp_lg_u32 s25, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX7-NEXT:    s_addc_u32 s20, s20, s28
-; GFX7-NEXT:    s_mul_i32 s25, s16, s14
+; GFX7-NEXT:    s_mul_i32 s26, s16, s14
 ; GFX7-NEXT:    s_mul_i32 s28, s1, s13
 ; GFX7-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v3, s11
 ; GFX7-NEXT:    s_mul_i32 s28, s2, s12
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v5, s10
 ; GFX7-NEXT:    s_mul_i32 s28, s3, s11
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX7-NEXT:    v_mul_hi_u32 v6, v0, s9
 ; GFX7-NEXT:    s_mul_i32 s28, s4, s10
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
 ; GFX7-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX7-NEXT:    v_mul_hi_u32 v6, v6, s8
 ; GFX7-NEXT:    s_mul_i32 s28, s5, s9
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
 ; GFX7-NEXT:    v_mul_hi_u32 v2, s16, v2
 ; GFX7-NEXT:    v_readfirstlane_b32 s36, v1
 ; GFX7-NEXT:    v_mul_hi_u32 v1, s2, v4
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    s_mul_i32 s28, s6, s8
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
-; GFX7-NEXT:    s_add_u32 s25, s28, s25
-; GFX7-NEXT:    s_addc_u32 s26, s35, s26
+; GFX7-NEXT:    s_add_u32 s26, s28, s26
+; GFX7-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX7-NEXT:    s_mul_i32 s28, s16, s13
 ; GFX7-NEXT:    v_readfirstlane_b32 s35, v2
-; GFX7-NEXT:    s_add_u32 s27, s28, s27
+; GFX7-NEXT:    s_add_u32 s28, s28, s29
 ; GFX7-NEXT:    v_readfirstlane_b32 s37, v1
 ; GFX7-NEXT:    v_mul_hi_u32 v1, v3, s10
-; GFX7-NEXT:    s_addc_u32 s25, s35, s25
+; GFX7-NEXT:    s_addc_u32 s26, s35, s26
 ; GFX7-NEXT:    s_mul_i32 s35, s1, s12
-; GFX7-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX7-NEXT:    s_add_u32 s27, s35, s27
-; GFX7-NEXT:    s_addc_u32 s25, s36, s25
+; GFX7-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX7-NEXT:    s_add_u32 s28, s35, s28
+; GFX7-NEXT:    s_addc_u32 s26, s36, s26
 ; GFX7-NEXT:    s_mul_i32 s36, s2, s11
 ; GFX7-NEXT:    s_cselect_b32 s35, 1, 0
-; GFX7-NEXT:    s_add_u32 s27, s36, s27
+; GFX7-NEXT:    s_add_u32 s28, s36, s28
 ; GFX7-NEXT:    v_readfirstlane_b32 s38, v1
 ; GFX7-NEXT:    v_mul_hi_u32 v1, v5, s9
-; GFX7-NEXT:    s_addc_u32 s25, s37, s25
+; GFX7-NEXT:    s_addc_u32 s26, s37, s26
 ; GFX7-NEXT:    s_mul_i32 s37, s3, s10
 ; GFX7-NEXT:    s_cselect_b32 s36, 1, 0
-; GFX7-NEXT:    s_add_u32 s27, s37, s27
+; GFX7-NEXT:    s_add_u32 s28, s37, s28
 ; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s8
-; GFX7-NEXT:    s_addc_u32 s25, s38, s25
+; GFX7-NEXT:    s_addc_u32 s26, s38, s26
 ; GFX7-NEXT:    s_mul_i32 s38, s4, s9
 ; GFX7-NEXT:    s_cselect_b32 s37, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s39, v1
-; GFX7-NEXT:    s_add_u32 s27, s38, s27
-; GFX7-NEXT:    s_addc_u32 s25, s39, s25
+; GFX7-NEXT:    s_add_u32 s28, s38, s28
+; GFX7-NEXT:    s_addc_u32 s26, s39, s26
 ; GFX7-NEXT:    s_mul_i32 s39, s5, s8
 ; GFX7-NEXT:    s_cselect_b32 s38, 1, 0
 ; GFX7-NEXT:    v_readfirstlane_b32 s40, v0
-; GFX7-NEXT:    s_add_u32 s27, s39, s27
-; GFX7-NEXT:    s_addc_u32 s25, s40, s25
+; GFX7-NEXT:    s_add_u32 s28, s39, s28
+; GFX7-NEXT:    s_addc_u32 s26, s40, s26
 ; GFX7-NEXT:    s_cselect_b32 s39, 1, 0
+; GFX7-NEXT:    s_and_b32 s30, s30, 1
+; GFX7-NEXT:    s_and_b32 s31, s31, 1
 ; GFX7-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX7-NEXT:    s_addc_u32 s30, s30, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s33, 0
+; GFX7-NEXT:    s_and_b32 s31, s33, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX7-NEXT:    s_addc_u32 s30, s30, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s34, 0
+; GFX7-NEXT:    s_and_b32 s31, s34, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX7-NEXT:    s_addc_u32 s30, s30, 0
+; GFX7-NEXT:    s_and_b32 s21, s21, 1
 ; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX7-NEXT:    s_addc_u32 s21, s30, s27
-; GFX7-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX7-NEXT:    s_addc_u32 s21, s30, s28
+; GFX7-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX7-NEXT:    s_and_b32 s22, s22, 1
+; GFX7-NEXT:    s_and_b32 s23, s23, 1
 ; GFX7-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX7-NEXT:    s_addc_u32 s22, s22, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX7-NEXT:    s_and_b32 s23, s24, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX7-NEXT:    s_addc_u32 s22, s22, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX7-NEXT:    s_and_b32 s23, s25, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX7-NEXT:    s_addc_u32 s22, s22, 0
-; GFX7-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX7-NEXT:    s_addc_u32 s22, s22, s25
+; GFX7-NEXT:    s_and_b32 s23, s28, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX7-NEXT:    s_addc_u32 s22, s22, s26
+; GFX7-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX7-NEXT:    s_mul_i32 s16, s16, s15
-; GFX7-NEXT:    s_addc_u32 s15, s26, s16
+; GFX7-NEXT:    s_and_b32 s15, s23, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX7-NEXT:    s_addc_u32 s15, s27, s16
 ; GFX7-NEXT:    s_mul_i32 s1, s1, s14
-; GFX7-NEXT:    s_cmp_lg_u32 s39, 0
+; GFX7-NEXT:    s_and_b32 s14, s39, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s15, s1
 ; GFX7-NEXT:    s_mul_i32 s2, s2, s13
-; GFX7-NEXT:    s_cmp_lg_u32 s38, 0
+; GFX7-NEXT:    s_and_b32 s13, s38, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s1, s2
+; GFX7-NEXT:    s_and_b32 s2, s37, 1
 ; GFX7-NEXT:    s_mul_i32 s3, s3, s12
-; GFX7-NEXT:    s_cmp_lg_u32 s37, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s1, s3
+; GFX7-NEXT:    s_and_b32 s2, s36, 1
 ; GFX7-NEXT:    s_mul_i32 s4, s4, s11
-; GFX7-NEXT:    s_cmp_lg_u32 s36, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s1, s4
+; GFX7-NEXT:    s_and_b32 s2, s35, 1
 ; GFX7-NEXT:    s_mul_i32 s5, s5, s10
-; GFX7-NEXT:    s_cmp_lg_u32 s35, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s1, s5
+; GFX7-NEXT:    s_and_b32 s2, s29, 1
 ; GFX7-NEXT:    s_mul_i32 s6, s6, s9
-; GFX7-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX7-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX7-NEXT:    s_mul_i32 s0, s0, s8
@@ -1301,7 +1346,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s16, v1
 ; GFX8-NEXT:    s_mul_i32 s18, s1, s8
-; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX8-NEXT:    s_add_u32 s18, s18, s17
 ; GFX8-NEXT:    s_addc_u32 s17, s23, s22
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s11
@@ -1312,33 +1357,33 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_mul_i32 s24, s1, s11
 ; GFX8-NEXT:    v_readfirstlane_b32 s28, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_readfirstlane_b32 s27, v5
+; GFX8-NEXT:    v_readfirstlane_b32 s25, v5
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v3, s9
-; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
 ; GFX8-NEXT:    s_add_u32 s24, s24, s22
-; GFX8-NEXT:    s_addc_u32 s23, s27, s23
+; GFX8-NEXT:    s_addc_u32 s23, s25, s23
 ; GFX8-NEXT:    v_readfirstlane_b32 s29, v5
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v5, s8
-; GFX8-NEXT:    s_mul_i32 s27, s2, s10
+; GFX8-NEXT:    s_mul_i32 s25, s2, s10
 ; GFX8-NEXT:    s_cselect_b32 s22, 1, 0
-; GFX8-NEXT:    s_add_u32 s24, s27, s24
+; GFX8-NEXT:    s_add_u32 s24, s25, s24
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s10
-; GFX8-NEXT:    s_addc_u32 s27, s28, s23
+; GFX8-NEXT:    s_addc_u32 s25, s28, s23
 ; GFX8-NEXT:    s_mul_i32 s28, s3, s9
 ; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX8-NEXT:    s_add_u32 s28, s28, s24
 ; GFX8-NEXT:    v_readfirstlane_b32 s30, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, s16, v4
-; GFX8-NEXT:    s_addc_u32 s27, s29, s27
+; GFX8-NEXT:    s_addc_u32 s25, s29, s25
 ; GFX8-NEXT:    s_mul_i32 s29, s4, s8
 ; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX8-NEXT:    s_add_u32 s28, s29, s28
 ; GFX8-NEXT:    v_readfirstlane_b32 s33, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v2, s9
-; GFX8-NEXT:    s_addc_u32 s27, s30, s27
+; GFX8-NEXT:    s_addc_u32 s29, s30, s25
 ; GFX8-NEXT:    s_mul_i32 s30, s16, s11
-; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s31, v6
 ; GFX8-NEXT:    s_add_u32 s19, s30, s19
 ; GFX8-NEXT:    s_addc_u32 s28, s31, s28
@@ -1356,125 +1401,149 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX8-NEXT:    s_cselect_b32 s33, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX8-NEXT:    s_add_u32 s19, s34, s19
-; GFX8-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX8-NEXT:    s_addc_u32 s28, s35, s28
-; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
 ; GFX8-NEXT:    s_cselect_b32 s34, 1, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX8-NEXT:    s_addc_u32 s19, s25, s19
+; GFX8-NEXT:    s_and_b32 s26, s26, 1
+; GFX8-NEXT:    s_and_b32 s27, s27, 1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s14
+; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
+; GFX8-NEXT:    s_addc_u32 s19, s26, s19
+; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_and_b32 s20, s20, 1
+; GFX8-NEXT:    s_and_b32 s21, s21, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s13
-; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
 ; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v2
 ; GFX8-NEXT:    s_addc_u32 s20, s20, 0
-; GFX8-NEXT:    v_readfirstlane_b32 s26, v0
+; GFX8-NEXT:    s_and_b32 s21, s26, 1
+; GFX8-NEXT:    v_readfirstlane_b32 s27, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v1
-; GFX8-NEXT:    s_cmp_lg_u32 s25, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX8-NEXT:    s_addc_u32 s20, s20, s28
-; GFX8-NEXT:    s_mul_i32 s25, s16, s14
+; GFX8-NEXT:    s_mul_i32 s26, s16, s14
 ; GFX8-NEXT:    s_mul_i32 s28, s1, s13
 ; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v3, s11
 ; GFX8-NEXT:    s_mul_i32 s28, s2, s12
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v5, s10
 ; GFX8-NEXT:    s_mul_i32 s28, s3, s11
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v0, s9
 ; GFX8-NEXT:    s_mul_i32 s28, s4, s10
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v6, s8
 ; GFX8-NEXT:    s_mul_i32 s28, s5, s9
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
 ; GFX8-NEXT:    v_mul_hi_u32 v2, s16, v2
 ; GFX8-NEXT:    v_readfirstlane_b32 s36, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v4
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    s_mul_i32 s28, s6, s8
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
-; GFX8-NEXT:    s_add_u32 s25, s28, s25
-; GFX8-NEXT:    s_addc_u32 s26, s35, s26
+; GFX8-NEXT:    s_add_u32 s26, s28, s26
+; GFX8-NEXT:    s_addc_u32 s27, s35, s27
 ; GFX8-NEXT:    s_mul_i32 s28, s16, s13
 ; GFX8-NEXT:    v_readfirstlane_b32 s35, v2
-; GFX8-NEXT:    s_add_u32 s27, s28, s27
+; GFX8-NEXT:    s_add_u32 s28, s28, s29
 ; GFX8-NEXT:    v_readfirstlane_b32 s37, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v3, s10
-; GFX8-NEXT:    s_addc_u32 s25, s35, s25
+; GFX8-NEXT:    s_addc_u32 s26, s35, s26
 ; GFX8-NEXT:    s_mul_i32 s35, s1, s12
-; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX8-NEXT:    s_add_u32 s27, s35, s27
-; GFX8-NEXT:    s_addc_u32 s25, s36, s25
+; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
+; GFX8-NEXT:    s_add_u32 s28, s35, s28
+; GFX8-NEXT:    s_addc_u32 s26, s36, s26
 ; GFX8-NEXT:    s_mul_i32 s36, s2, s11
 ; GFX8-NEXT:    s_cselect_b32 s35, 1, 0
-; GFX8-NEXT:    s_add_u32 s27, s36, s27
+; GFX8-NEXT:    s_add_u32 s28, s36, s28
 ; GFX8-NEXT:    v_readfirstlane_b32 s38, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v5, s9
-; GFX8-NEXT:    s_addc_u32 s25, s37, s25
+; GFX8-NEXT:    s_addc_u32 s26, s37, s26
 ; GFX8-NEXT:    s_mul_i32 s37, s3, s10
 ; GFX8-NEXT:    s_cselect_b32 s36, 1, 0
-; GFX8-NEXT:    s_add_u32 s27, s37, s27
+; GFX8-NEXT:    s_add_u32 s28, s37, s28
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s8
-; GFX8-NEXT:    s_addc_u32 s25, s38, s25
+; GFX8-NEXT:    s_addc_u32 s26, s38, s26
 ; GFX8-NEXT:    s_mul_i32 s38, s4, s9
 ; GFX8-NEXT:    s_cselect_b32 s37, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s39, v1
-; GFX8-NEXT:    s_add_u32 s27, s38, s27
-; GFX8-NEXT:    s_addc_u32 s25, s39, s25
+; GFX8-NEXT:    s_add_u32 s28, s38, s28
+; GFX8-NEXT:    s_addc_u32 s26, s39, s26
 ; GFX8-NEXT:    s_mul_i32 s39, s5, s8
 ; GFX8-NEXT:    s_cselect_b32 s38, 1, 0
 ; GFX8-NEXT:    v_readfirstlane_b32 s40, v0
-; GFX8-NEXT:    s_add_u32 s27, s39, s27
-; GFX8-NEXT:    s_addc_u32 s25, s40, s25
+; GFX8-NEXT:    s_add_u32 s28, s39, s28
+; GFX8-NEXT:    s_addc_u32 s26, s40, s26
 ; GFX8-NEXT:    s_cselect_b32 s39, 1, 0
+; GFX8-NEXT:    s_and_b32 s30, s30, 1
+; GFX8-NEXT:    s_and_b32 s31, s31, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX8-NEXT:    s_addc_u32 s30, s30, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s33, 0
+; GFX8-NEXT:    s_and_b32 s31, s33, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX8-NEXT:    s_addc_u32 s30, s30, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s34, 0
+; GFX8-NEXT:    s_and_b32 s31, s34, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX8-NEXT:    s_addc_u32 s30, s30, 0
+; GFX8-NEXT:    s_and_b32 s21, s21, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX8-NEXT:    s_addc_u32 s21, s30, s27
-; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX8-NEXT:    s_addc_u32 s21, s30, s28
+; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_and_b32 s22, s22, 1
+; GFX8-NEXT:    s_and_b32 s23, s23, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_addc_u32 s22, s22, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX8-NEXT:    s_and_b32 s23, s24, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_addc_u32 s22, s22, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX8-NEXT:    s_and_b32 s23, s25, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_addc_u32 s22, s22, 0
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX8-NEXT:    s_addc_u32 s22, s22, s25
+; GFX8-NEXT:    s_and_b32 s23, s28, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX8-NEXT:    s_addc_u32 s22, s22, s26
+; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
 ; GFX8-NEXT:    s_mul_i32 s16, s16, s15
-; GFX8-NEXT:    s_addc_u32 s15, s26, s16
+; GFX8-NEXT:    s_and_b32 s15, s23, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_addc_u32 s15, s27, s16
 ; GFX8-NEXT:    s_mul_i32 s1, s1, s14
-; GFX8-NEXT:    s_cmp_lg_u32 s39, 0
+; GFX8-NEXT:    s_and_b32 s14, s39, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s15, s1
 ; GFX8-NEXT:    s_mul_i32 s2, s2, s13
-; GFX8-NEXT:    s_cmp_lg_u32 s38, 0
+; GFX8-NEXT:    s_and_b32 s13, s38, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s2
+; GFX8-NEXT:    s_and_b32 s2, s37, 1
 ; GFX8-NEXT:    s_mul_i32 s3, s3, s12
-; GFX8-NEXT:    s_cmp_lg_u32 s37, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s2, s36, 1
 ; GFX8-NEXT:    s_mul_i32 s4, s4, s11
-; GFX8-NEXT:    s_cmp_lg_u32 s36, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s4
+; GFX8-NEXT:    s_and_b32 s2, s35, 1
 ; GFX8-NEXT:    s_mul_i32 s5, s5, s10
-; GFX8-NEXT:    s_cmp_lg_u32 s35, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s5
+; GFX8-NEXT:    s_and_b32 s2, s29, 1
 ; GFX8-NEXT:    s_mul_i32 s6, s6, s9
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX8-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX8-NEXT:    s_mul_i32 s0, s0, s8
@@ -1555,12 +1624,17 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s19, s34, s19
 ; GFX9-NEXT:    s_addc_u32 s24, s35, s24
 ; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
+; GFX9-NEXT:    s_and_b32 s22, s22, 1
+; GFX9-NEXT:    s_and_b32 s23, s23, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX9-NEXT:    s_addc_u32 s19, s22, s19
 ; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX9-NEXT:    s_and_b32 s20, s20, 1
+; GFX9-NEXT:    s_and_b32 s21, s21, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT:    s_addc_u32 s20, s20, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX9-NEXT:    s_and_b32 s21, s22, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT:    s_addc_u32 s20, s20, s24
 ; GFX9-NEXT:    s_mul_i32 s22, s16, s14
 ; GFX9-NEXT:    s_mul_i32 s24, s1, s13
@@ -1619,42 +1693,61 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX9-NEXT:    s_add_u32 s24, s39, s24
 ; GFX9-NEXT:    s_addc_u32 s22, s40, s22
 ; GFX9-NEXT:    s_cselect_b32 s39, 1, 0
+; GFX9-NEXT:    s_and_b32 s30, s30, 1
+; GFX9-NEXT:    s_and_b32 s31, s31, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX9-NEXT:    s_addc_u32 s30, s30, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s33, 0
+; GFX9-NEXT:    s_and_b32 s31, s33, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX9-NEXT:    s_addc_u32 s30, s30, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s34, 0
+; GFX9-NEXT:    s_and_b32 s31, s34, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s31, 0
 ; GFX9-NEXT:    s_addc_u32 s30, s30, 0
+; GFX9-NEXT:    s_and_b32 s21, s21, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
 ; GFX9-NEXT:    s_addc_u32 s21, s30, s24
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_and_b32 s26, s26, 1
+; GFX9-NEXT:    s_and_b32 s27, s27, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
 ; GFX9-NEXT:    s_addc_u32 s26, s26, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_and_b32 s27, s28, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
 ; GFX9-NEXT:    s_addc_u32 s26, s26, 0
-; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
+; GFX9-NEXT:    s_and_b32 s27, s29, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
 ; GFX9-NEXT:    s_addc_u32 s26, s26, 0
+; GFX9-NEXT:    s_and_b32 s24, s24, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX9-NEXT:    s_addc_u32 s22, s26, s22
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_mul_i32 s16, s16, s15
+; GFX9-NEXT:    s_and_b32 s15, s24, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX9-NEXT:    s_addc_u32 s15, s23, s16
 ; GFX9-NEXT:    s_mul_i32 s1, s1, s14
-; GFX9-NEXT:    s_cmp_lg_u32 s39, 0
+; GFX9-NEXT:    s_and_b32 s14, s39, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s15, s1
 ; GFX9-NEXT:    s_mul_i32 s2, s2, s13
-; GFX9-NEXT:    s_cmp_lg_u32 s38, 0
+; GFX9-NEXT:    s_and_b32 s13, s38, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s2
+; GFX9-NEXT:    s_and_b32 s2, s37, 1
 ; GFX9-NEXT:    s_mul_i32 s3, s3, s12
-; GFX9-NEXT:    s_cmp_lg_u32 s37, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s3
+; GFX9-NEXT:    s_and_b32 s2, s36, 1
 ; GFX9-NEXT:    s_mul_i32 s4, s4, s11
-; GFX9-NEXT:    s_cmp_lg_u32 s36, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s4
+; GFX9-NEXT:    s_and_b32 s2, s35, 1
 ; GFX9-NEXT:    s_mul_i32 s5, s5, s10
-; GFX9-NEXT:    s_cmp_lg_u32 s35, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_and_b32 s2, s25, 1
 ; GFX9-NEXT:    s_mul_i32 s6, s6, s9
-; GFX9-NEXT:    s_cmp_lg_u32 s25, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX9-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s8
@@ -1685,17 +1778,17 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s9
 ; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX10PLUS-NEXT:    s_add_u32 s16, s21, s16
-; GFX10PLUS-NEXT:    s_addc_u32 s17, s22, s17
-; GFX10PLUS-NEXT:    s_mul_i32 s22, s1, s8
+; GFX10PLUS-NEXT:    s_addc_u32 s21, s22, s17
+; GFX10PLUS-NEXT:    s_mul_i32 s17, s1, s8
 ; GFX10PLUS-NEXT:    s_mul_hi_u32 s23, s1, s8
-; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10PLUS-NEXT:    s_add_u32 s16, s22, s16
-; GFX10PLUS-NEXT:    s_addc_u32 s17, s23, s17
+; GFX10PLUS-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX10PLUS-NEXT:    s_add_u32 s17, s17, s16
+; GFX10PLUS-NEXT:    s_addc_u32 s16, s23, s21
 ; GFX10PLUS-NEXT:    s_mul_i32 s23, s0, s12
 ; GFX10PLUS-NEXT:    s_mul_i32 s25, s1, s11
 ; GFX10PLUS-NEXT:    s_mul_hi_u32 s24, s0, s12
 ; GFX10PLUS-NEXT:    s_mul_hi_u32 s26, s1, s11
-; GFX10PLUS-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX10PLUS-NEXT:    s_add_u32 s23, s25, s23
 ; GFX10PLUS-NEXT:    s_addc_u32 s24, s26, s24
 ; GFX10PLUS-NEXT:    s_mul_i32 s26, s2, s10
@@ -1734,15 +1827,21 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX10PLUS-NEXT:    s_add_u32 s18, s33, s18
 ; GFX10PLUS-NEXT:    s_addc_u32 s23, s34, s23
 ; GFX10PLUS-NEXT:    s_cselect_b32 s33, 1, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s14
-; GFX10PLUS-NEXT:    s_addc_u32 s18, s21, s18
+; GFX10PLUS-NEXT:    s_and_b32 s21, s21, 1
+; GFX10PLUS-NEXT:    s_and_b32 s22, s22, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s1, s13
+; GFX10PLUS-NEXT:    s_addc_u32 s18, s22, s18
 ; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s20, s20, 1
+; GFX10PLUS-NEXT:    s_and_b32 s19, s19, 1
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s1, s13
+; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s14
 ; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10PLUS-NEXT:    s_and_b32 s20, s21, 1
 ; GFX10PLUS-NEXT:    s_mul_i32 s21, s0, s14
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10PLUS-NEXT:    s_mul_hi_u32 s35, s1, s12
 ; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, s23
 ; GFX10PLUS-NEXT:    s_mul_i32 s23, s1, s13
 ; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
@@ -1773,7 +1872,6 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX10PLUS-NEXT:    s_add_u32 s23, s23, s24
 ; GFX10PLUS-NEXT:    s_addc_u32 s21, s34, s21
 ; GFX10PLUS-NEXT:    s_mul_i32 s34, s1, s12
-; GFX10PLUS-NEXT:    s_mul_hi_u32 s35, s1, s12
 ; GFX10PLUS-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX10PLUS-NEXT:    s_add_u32 s23, s34, s23
 ; GFX10PLUS-NEXT:    s_addc_u32 s21, s35, s21
@@ -1798,52 +1896,71 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX10PLUS-NEXT:    s_add_u32 s23, s38, s23
 ; GFX10PLUS-NEXT:    s_addc_u32 s21, s39, s21
 ; GFX10PLUS-NEXT:    s_cselect_b32 s38, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s30, s30, 1
+; GFX10PLUS-NEXT:    s_and_b32 s29, s29, 1
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s30, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s14
+; GFX10PLUS-NEXT:    s_mul_i32 s15, s0, s15
 ; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s31, 0
+; GFX10PLUS-NEXT:    s_and_b32 s30, s31, 1
+; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s14
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s30, 0
 ; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s13
 ; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s33, 0
+; GFX10PLUS-NEXT:    s_and_b32 s30, s33, 1
 ; GFX10PLUS-NEXT:    s_mul_i32 s3, s3, s12
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX10PLUS-NEXT:    s_mul_i32 s4, s4, s11
 ; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
+; GFX10PLUS-NEXT:    s_and_b32 s20, s20, 1
+; GFX10PLUS-NEXT:    s_mul_i32 s5, s5, s10
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s4, s4, s11
+; GFX10PLUS-NEXT:    s_mul_i32 s6, s6, s9
 ; GFX10PLUS-NEXT:    s_addc_u32 s20, s29, s23
 ; GFX10PLUS-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s26, s26, 1
+; GFX10PLUS-NEXT:    s_and_b32 s25, s25, 1
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s26, s0, s15
+; GFX10PLUS-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s5, s5, s10
+; GFX10PLUS-NEXT:    s_and_b32 s26, s27, 1
+; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s8
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s6, s6, s9
+; GFX10PLUS-NEXT:    s_and_b32 s26, s28, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
+; GFX10PLUS-NEXT:    s_and_b32 s23, s23, 1
 ; GFX10PLUS-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s7, s7, s8
-; GFX10PLUS-NEXT:    s_addc_u32 s15, s25, s21
-; GFX10PLUS-NEXT:    s_addc_u32 s21, s22, s26
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s38, 0
-; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s8
-; GFX10PLUS-NEXT:    s_addc_u32 s1, s21, s1
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s37, 0
+; GFX10PLUS-NEXT:    s_addc_u32 s21, s25, s21
+; GFX10PLUS-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s23, s23, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX10PLUS-NEXT:    s_addc_u32 s15, s22, s15
+; GFX10PLUS-NEXT:    s_and_b32 s22, s38, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX10PLUS-NEXT:    s_addc_u32 s1, s15, s1
+; GFX10PLUS-NEXT:    s_and_b32 s14, s37, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s2
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s36, 0
-; GFX10PLUS-NEXT:    s_mov_b32 s2, s17
+; GFX10PLUS-NEXT:    s_and_b32 s2, s36, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s3
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s35, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s35, 1
 ; GFX10PLUS-NEXT:    s_mov_b32 s3, s18
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s4
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s34, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s34, 1
 ; GFX10PLUS-NEXT:    s_mov_b32 s4, s19
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s5
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s24, 1
 ; GFX10PLUS-NEXT:    s_mov_b32 s5, s20
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10PLUS-NEXT:    s_mov_b32 s2, s16
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s6
-; GFX10PLUS-NEXT:    s_mov_b32 s6, s15
+; GFX10PLUS-NEXT:    s_mov_b32 s6, s21
 ; GFX10PLUS-NEXT:    s_add_i32 s7, s1, s7
-; GFX10PLUS-NEXT:    s_mov_b32 s1, s16
+; GFX10PLUS-NEXT:    s_mov_b32 s1, s17
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_mul_i256:
@@ -1864,17 +1981,17 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX12-NEXT:    s_mul_hi_u32 s22, s0, s9
 ; GFX12-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX12-NEXT:    s_add_co_u32 s16, s21, s16
-; GFX12-NEXT:    s_add_co_ci_u32 s17, s22, s17
-; GFX12-NEXT:    s_mul_i32 s22, s1, s8
+; GFX12-NEXT:    s_add_co_ci_u32 s21, s22, s17
+; GFX12-NEXT:    s_mul_i32 s17, s1, s8
 ; GFX12-NEXT:    s_mul_hi_u32 s23, s1, s8
-; GFX12-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX12-NEXT:    s_add_co_u32 s16, s22, s16
-; GFX12-NEXT:    s_add_co_ci_u32 s17, s23, s17
+; GFX12-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX12-NEXT:    s_add_co_u32 s17, s17, s16
+; GFX12-NEXT:    s_add_co_ci_u32 s16, s23, s21
 ; GFX12-NEXT:    s_mul_i32 s23, s0, s12
 ; GFX12-NEXT:    s_mul_i32 s25, s1, s11
 ; GFX12-NEXT:    s_mul_hi_u32 s24, s0, s12
 ; GFX12-NEXT:    s_mul_hi_u32 s26, s1, s11
-; GFX12-NEXT:    s_cselect_b32 s22, 1, 0
+; GFX12-NEXT:    s_cselect_b32 s21, 1, 0
 ; GFX12-NEXT:    s_add_co_u32 s23, s25, s23
 ; GFX12-NEXT:    s_add_co_ci_u32 s24, s26, s24
 ; GFX12-NEXT:    s_mul_i32 s26, s2, s10
@@ -1913,15 +2030,21 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX12-NEXT:    s_add_co_u32 s18, s33, s18
 ; GFX12-NEXT:    s_add_co_ci_u32 s23, s34, s23
 ; GFX12-NEXT:    s_cselect_b32 s33, 1, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX12-NEXT:    s_mul_hi_u32 s22, s0, s14
-; GFX12-NEXT:    s_add_co_ci_u32 s18, s21, s18
+; GFX12-NEXT:    s_and_b32 s21, s21, 1
+; GFX12-NEXT:    s_and_b32 s22, s22, 1
+; GFX12-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX12-NEXT:    s_mul_hi_u32 s34, s1, s13
+; GFX12-NEXT:    s_add_co_ci_u32 s18, s22, s18
 ; GFX12-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX12-NEXT:    s_and_b32 s20, s20, 1
+; GFX12-NEXT:    s_and_b32 s19, s19, 1
 ; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX12-NEXT:    s_mul_hi_u32 s34, s1, s13
+; GFX12-NEXT:    s_mul_hi_u32 s22, s0, s14
 ; GFX12-NEXT:    s_add_co_ci_u32 s19, s19, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX12-NEXT:    s_and_b32 s20, s21, 1
 ; GFX12-NEXT:    s_mul_i32 s21, s0, s14
+; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX12-NEXT:    s_mul_hi_u32 s35, s1, s12
 ; GFX12-NEXT:    s_add_co_ci_u32 s19, s19, s23
 ; GFX12-NEXT:    s_mul_i32 s23, s1, s13
 ; GFX12-NEXT:    s_cselect_b32 s20, 1, 0
@@ -1952,7 +2075,6 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX12-NEXT:    s_add_co_u32 s23, s23, s24
 ; GFX12-NEXT:    s_add_co_ci_u32 s21, s34, s21
 ; GFX12-NEXT:    s_mul_i32 s34, s1, s12
-; GFX12-NEXT:    s_mul_hi_u32 s35, s1, s12
 ; GFX12-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX12-NEXT:    s_add_co_u32 s23, s34, s23
 ; GFX12-NEXT:    s_add_co_ci_u32 s21, s35, s21
@@ -1977,52 +2099,75 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ; GFX12-NEXT:    s_add_co_u32 s23, s38, s23
 ; GFX12-NEXT:    s_add_co_ci_u32 s21, s39, s21
 ; GFX12-NEXT:    s_cselect_b32 s38, 1, 0
+; GFX12-NEXT:    s_and_b32 s30, s30, 1
+; GFX12-NEXT:    s_and_b32 s29, s29, 1
 ; GFX12-NEXT:    s_cmp_lg_u32 s30, 0
-; GFX12-NEXT:    s_mul_i32 s1, s1, s14
+; GFX12-NEXT:    s_mul_i32 s15, s0, s15
 ; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s31, 0
+; GFX12-NEXT:    s_and_b32 s30, s31, 1
+; GFX12-NEXT:    s_mul_i32 s1, s1, s14
+; GFX12-NEXT:    s_cmp_lg_u32 s30, 0
 ; GFX12-NEXT:    s_mul_i32 s2, s2, s13
 ; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s33, 0
+; GFX12-NEXT:    s_and_b32 s30, s33, 1
 ; GFX12-NEXT:    s_mul_i32 s3, s3, s12
+; GFX12-NEXT:    s_cmp_lg_u32 s30, 0
+; GFX12-NEXT:    s_mul_i32 s4, s4, s11
 ; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
+; GFX12-NEXT:    s_and_b32 s20, s20, 1
+; GFX12-NEXT:    s_mul_i32 s5, s5, s10
 ; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX12-NEXT:    s_mul_i32 s4, s4, s11
+; GFX12-NEXT:    s_mul_i32 s6, s6, s9
 ; GFX12-NEXT:    s_add_co_ci_u32 s20, s29, s23
 ; GFX12-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX12-NEXT:    s_and_b32 s26, s26, 1
+; GFX12-NEXT:    s_and_b32 s25, s25, 1
 ; GFX12-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX12-NEXT:    s_mul_i32 s26, s0, s15
+; GFX12-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX12-NEXT:    s_mul_i32 s5, s5, s10
+; GFX12-NEXT:    s_and_b32 s26, s27, 1
+; GFX12-NEXT:    s_mul_i32 s0, s0, s8
+; GFX12-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
-; GFX12-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX12-NEXT:    s_mul_i32 s6, s6, s9
+; GFX12-NEXT:    s_and_b32 s26, s28, 1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
+; GFX12-NEXT:    s_and_b32 s23, s23, 1
 ; GFX12-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX12-NEXT:    s_mul_i32 s7, s7, s8
-; GFX12-NEXT:    s_add_co_ci_u32 s15, s25, s21
-; GFX12-NEXT:    s_add_co_ci_u32 s21, s22, s26
-; GFX12-NEXT:    s_cmp_lg_u32 s38, 0
-; GFX12-NEXT:    s_mul_i32 s0, s0, s8
-; GFX12-NEXT:    s_add_co_ci_u32 s1, s21, s1
-; GFX12-NEXT:    s_cmp_lg_u32 s37, 0
+; GFX12-NEXT:    s_add_co_ci_u32 s21, s25, s21
+; GFX12-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s23, s23, 1
+; GFX12-NEXT:    s_cmp_lg_u32 s23, 0
+; GFX12-NEXT:    s_add_co_ci_u32 s15, s22, s15
+; GFX12-NEXT:    s_and_b32 s22, s38, 1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s15, s1
+; GFX12-NEXT:    s_and_b32 s14, s37, 1
+; GFX12-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s2
-; GFX12-NEXT:    s_cmp_lg_u32 s36, 0
-; GFX12-NEXT:    s_mov_b32 s2, s17
+; GFX12-NEXT:    s_and_b32 s2, s36, 1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s3
-; GFX12-NEXT:    s_cmp_lg_u32 s35, 0
+; GFX12-NEXT:    s_and_b32 s2, s35, 1
 ; GFX12-NEXT:    s_mov_b32 s3, s18
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s4
-; GFX12-NEXT:    s_cmp_lg_u32 s34, 0
+; GFX12-NEXT:    s_and_b32 s2, s34, 1
 ; GFX12-NEXT:    s_mov_b32 s4, s19
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s5
-; GFX12-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX12-NEXT:    s_and_b32 s2, s24, 1
 ; GFX12-NEXT:    s_mov_b32 s5, s20
+; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX12-NEXT:    s_mov_b32 s2, s16
 ; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s6
-; GFX12-NEXT:    s_mov_b32 s6, s15
+; GFX12-NEXT:    s_mov_b32 s6, s21
 ; GFX12-NEXT:    s_add_co_i32 s7, s1, s7
-; GFX12-NEXT:    s_mov_b32 s1, s16
+; GFX12-NEXT:    s_mov_b32 s1, s17
 ; GFX12-NEXT:    ; return to shader part epilog
   %result = mul i256 %num, %den
   %cast = bitcast i256 %result to <8 x i32>
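
The churn in s_mul_i128 and s_mul_i256 is all carry plumbing: each s_cselect_b32 x, 1, 0 captures a carry-out from SCC, and the newly inserted s_and_b32 x, x, 1 re-masks it before the s_cmp_lg_u32 x, 0 that regenerates SCC for the dependent s_addc_u32; previously the compare tested the cselect result directly. The underlying shape, sketched as a two-limb add with an explicit carry (hypothetical helper, not taken from the patch):

define { i32, i32 } @add64_sketch(i32 %xl, i32 %xh, i32 %yl, i32 %yh) {
  %lo = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %xl, i32 %yl)
  %lo.v = extractvalue { i32, i1 } %lo, 0
  %carry = extractvalue { i32, i1 } %lo, 1
  ; capturing the i1 carry as an i32 is what s_cselect_b32 x, 1, 0 models
  %carry.z = zext i1 %carry to i32
  %hi.a = add i32 %xh, %yh
  ; folding the carry into the high limb is the s_addc_u32 step; the
  ; s_and_b32/s_cmp_lg_u32 pair in the diff re-derives SCC from this value
  %hi = add i32 %hi.a, %carry.z
  %r0 = insertvalue { i32, i32 } poison, i32 %lo.v, 0
  %r1 = insertvalue { i32, i32 } %r0, i32 %hi, 1
  ret { i32, i32 } %r1
}
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)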
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index c7afbeabbbb6b1..fbf8c4a5faf8e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -20,11 +20,17 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_movk_i32 s32, 0x400
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-NEXT:    s_and_b32 s4, s4, 1
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_load_dword s4, s[6:7], 0xc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-NEXT:    s_and_b32 s4, s4, 1
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
 ; GCN-NEXT:  ; %bb.2: ; %bb.1
 ; GCN-NEXT:    s_load_dword s5, s[6:7], 0x10
@@ -91,6 +97,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; GCN-NEXT:    s_movk_i32 s32, 0x1000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
+; GCN-NEXT:    s_and_b32 s4, s4, 1
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
 ; GCN-NEXT:    s_load_dword s4, s[6:7], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 168e6dfa5f147d..7e7d9286612419 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4186,6 +4186,9 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_i48:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s4, s0, s2
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_and_b32 s5, s5, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s1, s3
 ; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -4211,6 +4214,9 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-LABEL: s_saddsat_i48:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s4, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s1, s3
 ; GFX8-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -4238,6 +4244,9 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX9-NEXT:    s_add_u32 s4, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -4262,15 +4271,18 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX10-NEXT:    s_add_u32 s4, s0, s2
-; GFX10-NEXT:    s_addc_u32 s5, s1, s3
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], 0
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
-; GFX10-NEXT:    s_xor_b32 s0, s1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
+; GFX10-NEXT:    s_ashr_i32 s1, s5, 31
+; GFX10-NEXT:    s_add_i32 s3, s1, 0x80000000
+; GFX10-NEXT:    s_xor_b32 s0, s2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
 ; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -4282,14 +4294,17 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX11-NEXT:    s_add_u32 s4, s0, s2
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[2:3], 0
+; GFX11-NEXT:    s_and_b32 s5, s5, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX11-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
-; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
-; GFX11-NEXT:    s_xor_b32 s0, s1, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
+; GFX11-NEXT:    s_ashr_i32 s1, s5, 31
+; GFX11-NEXT:    s_add_i32 s3, s1, 0x80000000
+; GFX11-NEXT:    s_xor_b32 s0, s2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -4566,6 +4581,9 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s4, s0, s2
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_and_b32 s5, s5, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
@@ -4587,6 +4605,9 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-LABEL: s_saddsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s4, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -4608,6 +4629,9 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-LABEL: s_saddsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s4, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -4629,8 +4653,11 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-LABEL: s_saddsat_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s4, s0, s2
-; GFX10-NEXT:    s_addc_u32 s5, s1, s3
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
@@ -4646,6 +4673,9 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX11-LABEL: s_saddsat_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_add_u32 s4, s0, s2
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    s_and_b32 s5, s5, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX11-NEXT:    s_addc_u32 s5, s1, s3
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
@@ -4929,6 +4959,9 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-LABEL: s_saddsat_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s8, s0, s4
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_and_b32 s9, s9, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
@@ -4936,14 +4969,17 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX6-NEXT:    s_add_u32 s0, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
-; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX6-NEXT:    s_add_u32 s0, s2, s6
+; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
@@ -4967,6 +5003,9 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-LABEL: s_saddsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s8, s0, s4
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_and_b32 s9, s9, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -4974,14 +5013,17 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX8-NEXT:    s_add_u32 s0, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT:    s_add_u32 s0, s2, s6
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -5005,6 +5047,9 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-LABEL: s_saddsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s8, s0, s4
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_and_b32 s9, s9, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -5012,14 +5057,17 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    s_add_u32 s0, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX9-NEXT:    s_add_u32 s0, s2, s6
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -5043,8 +5091,11 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-LABEL: s_saddsat_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s8, s0, s4
-; GFX10-NEXT:    s_addc_u32 s9, s1, s5
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[4:5], 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
@@ -5052,14 +5103,17 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX10-NEXT:    s_add_u32 s0, s2, s6
-; GFX10-NEXT:    s_addc_u32 s1, s3, s7
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
+; GFX10-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[6:7], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
 ; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
@@ -5073,6 +5127,9 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-LABEL: s_saddsat_v2i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_add_u32 s8, s0, s4
+; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX11-NEXT:    s_and_b32 s9, s9, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX11-NEXT:    s_addc_u32 s9, s1, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
@@ -5081,13 +5138,16 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX11-NEXT:    s_add_u32 s0, s2, s6
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
+; GFX11-NEXT:    s_and_b32 s1, s1, 1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
+; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-NEXT:    s_addc_u32 s1, s3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s3, s[6:7], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
 ; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
@@ -5105,10 +5165,19 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s4, s0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s5, s1, s5
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s8, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    s_and_b32 s9, s9, 1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s9, s3, s7
@@ -5146,9 +5215,18 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-LABEL: s_saddsat_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s4, s0, s4
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s5, s1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s8, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_and_b32 s9, s9, 1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_addc_u32 s9, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -5193,9 +5271,18 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-LABEL: s_saddsat_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s4, s0, s4
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s5, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s8, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_and_b32 s9, s9, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_addc_u32 s9, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -5240,30 +5327,39 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-LABEL: s_saddsat_i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s4, s0, s4
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s10, s[6:7], 0
+; GFX10-NEXT:    s_and_b32 s8, s8, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s5, s1, s5
-; GFX10-NEXT:    s_addc_u32 s8, s2, s6
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
+; GFX10-NEXT:    s_and_b32 s8, s8, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_addc_u32 s8, s2, s6
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_addc_u32 s9, s3, s7
 ; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v3, s9
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[8:9], s[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[6:7], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX10-NEXT:    s_and_b32 s0, 1, s10
 ; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s0
 ; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    s_add_i32 s1, s0, 0x80000000
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5281,30 +5377,38 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-LABEL: s_saddsat_i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_add_u32 s4, s0, s4
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s10, s[6:7], 0
+; GFX11-NEXT:    s_and_b32 s8, s8, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    s_addc_u32 s5, s1, s5
-; GFX11-NEXT:    s_addc_u32 s8, s2, s6
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
+; GFX11-NEXT:    s_and_b32 s8, s8, 1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX11-NEXT:    s_addc_u32 s8, s2, s6
+; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    s_and_b32 s9, s9, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX11-NEXT:    s_addc_u32 s9, s3, s7
 ; GFX11-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
-; GFX11-NEXT:    v_mov_b32_e32 v3, s9
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[8:9], s[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[6:7], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX11-NEXT:    s_and_b32 s0, 1, s10
 ; GFX11-NEXT:    s_cmp_eq_u64 s[6:7], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX11-NEXT:    s_and_b32 s1, 1, s1
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v2, s5
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v2, v1 :: v_dual_mov_b32 v2, s5
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s0
 ; GFX11-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
-; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s8
@@ -5895,10 +5999,19 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-LABEL: s_saddsat_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s8, s0, s8
-; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_and_b32 s16, s16, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_addc_u32 s9, s1, s9
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_and_b32 s16, s16, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_addc_u32 s16, s2, s10
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    s_and_b32 s17, s17, 1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s17, s3, s11
@@ -5912,26 +6025,35 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s17, 31
-; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
+; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_add_u32 s0, s4, s12
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX6-NEXT:    s_and_b32 s1, s1, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_addc_u32 s1, s5, s13
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s17
+; GFX6-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX6-NEXT:    s_add_u32 s0, s4, s12
+; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    s_addc_u32 s1, s5, s13
+; GFX6-NEXT:    s_and_b32 s3, s3, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    s_addc_u32 s2, s6, s14
+; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_addc_u32 s3, s7, s15
@@ -5973,9 +6095,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-LABEL: s_saddsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s8, s0, s8
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_and_b32 s16, s16, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_addc_u32 s9, s1, s9
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_and_b32 s16, s16, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_addc_u32 s16, s2, s10
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_and_b32 s17, s17, 1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_addc_u32 s17, s3, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -5996,25 +6127,34 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s17, 31
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, s12
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_addc_u32 s1, s5, s13
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s17
-; GFX8-NEXT:    s_add_u32 s0, s4, s12
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX8-NEXT:    s_addc_u32 s1, s5, s13
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    s_addc_u32 s2, s6, s14
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
@@ -6063,9 +6203,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-LABEL: s_saddsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s8, s0, s8
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_and_b32 s16, s16, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_addc_u32 s9, s1, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_and_b32 s16, s16, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_addc_u32 s16, s2, s10
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_and_b32 s17, s17, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_addc_u32 s17, s3, s11
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -6086,25 +6235,34 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s17, 31
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_add_u32 s0, s4, s12
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_addc_u32 s1, s5, s13
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s17
-; GFX9-NEXT:    s_add_u32 s0, s4, s12
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX9-NEXT:    s_addc_u32 s1, s5, s13
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    s_addc_u32 s2, s6, s14
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
@@ -6153,42 +6311,60 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-LABEL: s_saddsat_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_add_u32 s8, s0, s8
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_and_b32 s16, s16, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_addc_u32 s9, s1, s9
-; GFX10-NEXT:    s_addc_u32 s16, s2, s10
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT:    s_addc_u32 s17, s3, s11
+; GFX10-NEXT:    s_and_b32 s16, s16, 1
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], 0
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s9
-; GFX10-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
-; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX10-NEXT:    s_addc_u32 s16, s2, s10
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    s_and_b32 s17, s17, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_addc_u32 s17, s3, s11
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX10-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
+; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s18
 ; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s1, 1, s1
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    s_ashr_i32 s10, s17, 31
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
 ; GFX10-NEXT:    s_add_i32 s11, s10, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT:    s_and_b32 s1, 1, s0
 ; GFX10-NEXT:    s_add_u32 s0, s4, s12
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
+; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v5, s0
+; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s1
 ; GFX10-NEXT:    s_addc_u32 s1, s5, s13
-; GFX10-NEXT:    s_addc_u32 s2, s6, s14
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX10-NEXT:    s_addc_u32 s3, s7, s15
+; GFX10-NEXT:    s_and_b32 s2, s2, 1
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX10-NEXT:    v_mov_b32_e32 v5, s0
-; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, s1
+; GFX10-NEXT:    s_addc_u32 s2, s6, s14
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_addc_u32 s3, s7, s15
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
+; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, s[14:15], 0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_mov_b32_e32 v6, s1
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX10-NEXT:    s_and_b32 s4, 1, s12
@@ -6231,40 +6407,59 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-LABEL: s_saddsat_v2i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_add_u32 s8, s0, s8
+; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX11-NEXT:    s_and_b32 s16, s16, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT:    s_addc_u32 s9, s1, s9
-; GFX11-NEXT:    s_addc_u32 s16, s2, s10
+; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
+; GFX11-NEXT:    s_and_b32 s16, s16, 1
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], 0
+; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX11-NEXT:    s_addc_u32 s16, s2, s10
+; GFX11-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_and_b32 s17, s17, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX11-NEXT:    s_addc_u32 s17, s3, s11
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
 ; GFX11-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s18
 ; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s1, 1, s1
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX11-NEXT:    s_ashr_i32 s10, s17, 31
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
 ; GFX11-NEXT:    s_add_i32 s11, s10, 0x80000000
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT:    s_and_b32 s1, 1, s0
 ; GFX11-NEXT:    s_add_u32 s0, s4, s12
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
+; GFX11-NEXT:    v_mov_b32_e32 v5, s0
+; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s1
 ; GFX11-NEXT:    s_addc_u32 s1, s5, s13
-; GFX11-NEXT:    s_addc_u32 s2, s6, s14
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX11-NEXT:    s_addc_u32 s3, s7, s15
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-NEXT:    s_addc_u32 s2, s6, s14
+; GFX11-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX11-NEXT:    s_and_b32 s3, s3, 1
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-NEXT:    s_addc_u32 s3, s7, s15
+; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s6, s[14:15], 0
-; GFX11-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_and_b32 v0, 1, v0
+; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX11-NEXT:    s_and_b32 s4, 1, s12
 ; GFX11-NEXT:    s_cmp_eq_u64 s[14:15], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 81abe91b283f96..1de2cca4c4c728 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -205,8 +205,14 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_ashr_i32 s6, s3, 31
 ; CHECK-NEXT:    s_ashr_i32 s8, s5, 31
 ; CHECK-NEXT:    s_add_u32 s0, s2, s6
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_and_b32 s1, s1, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
 ; CHECK-NEXT:    s_addc_u32 s1, s3, s6
 ; CHECK-NEXT:    s_add_u32 s10, s4, s8
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_and_b32 s3, s3, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    s_addc_u32 s11, s5, s8
 ; CHECK-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
@@ -217,34 +223,37 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_sub_u32 s3, 0, s10
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT:    s_subb_u32 s5, 0, s11
-; CHECK-NEXT:    s_xor_b64 s[6:7], s[6:7], s[8:9]
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_and_b32 s5, s5, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v2, v1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; CHECK-NEXT:    s_subb_u32 s5, 0, s11
+; CHECK-NEXT:    s_xor_b64 s[6:7], s[6:7], s[8:9]
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
 ; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v6, v3, v0
 ; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
 ; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v1
+; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v1
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
@@ -1179,9 +1188,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    s_cselect_b32 s7, 1, 0
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s7, 0
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
@@ -1191,23 +1203,23 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6]
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v4
+; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v4
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
-; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v8
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
@@ -1273,33 +1285,36 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9]
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    s_cselect_b32 s7, 1, 0
 ; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v13, v8, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v13, v8
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v9, s[4:5]
-; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v11
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
 ; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v9, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
+; GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s7, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v13
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 1, v13
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
-; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v14, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v14, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v16, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 7d7f450e590faa..f1af4db3282009 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -150,8 +150,14 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
 ; GFX8-NEXT:    s_ashr_i32 s12, s11, 31
 ; GFX8-NEXT:    s_add_u32 s0, s8, s2
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s9, s2
 ; GFX8-NEXT:    s_add_u32 s8, s10, s12
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    s_mov_b32 s13, s12
 ; GFX8-NEXT:    s_addc_u32 s9, s11, s12
 ; GFX8-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
@@ -163,7 +169,8 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_sub_u32 s14, 0, s8
-; GFX8-NEXT:    s_subb_u32 s15, 0, s9
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_and_b32 s15, s15, 1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
@@ -171,6 +178,8 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_subb_u32 s15, 0, s9
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
 ; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v0
@@ -183,14 +192,14 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v4, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
@@ -310,8 +319,14 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    s_ashr_i32 s2, s13, 31
 ; GFX9-NEXT:    s_ashr_i32 s4, s15, 31
 ; GFX9-NEXT:    s_add_u32 s0, s12, s2
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s13, s2
 ; GFX9-NEXT:    s_add_u32 s6, s14, s4
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    s_mov_b32 s5, s4
 ; GFX9-NEXT:    s_addc_u32 s7, s15, s4
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
@@ -323,7 +338,8 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_sub_u32 s14, 0, s6
-; GFX9-NEXT:    s_subb_u32 s15, 0, s7
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_and_b32 s15, s15, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -331,6 +347,8 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_subb_u32 s15, 0, s7
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v0
@@ -343,15 +361,15 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v4, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
@@ -464,17 +482,28 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    s_ashr_i32 s2, s13, 31
 ; GFX10-NEXT:    s_ashr_i32 s4, s15, 31
 ; GFX10-NEXT:    s_add_u32 s0, s12, s2
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_mov_b32 s5, s4
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s13, s2
 ; GFX10-NEXT:    s_add_u32 s6, s14, s4
-; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    s_addc_u32 s7, s15, s4
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX10-NEXT:    s_addc_u32 s7, s15, s4
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s6
 ; GFX10-NEXT:    s_sub_u32 s12, 0, s6
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_and_b32 s13, s13, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
+; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_subb_u32 s13, 0, s7
+; GFX10-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -484,11 +513,9 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s13, s12, v3, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s13, s12, v4, v[1:2]
-; GFX10-NEXT:    s_subb_u32 s13, 0, s7
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s14, s12, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s14, s12, v4, v[1:2]
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s14, s13, v3, v[1:2]
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v3, v0
@@ -1277,8 +1304,14 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    s_ashr_i32 s4, s13, 31
 ; GFX8-NEXT:    s_ashr_i32 s6, s1, 31
 ; GFX8-NEXT:    s_add_u32 s16, s12, s4
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    s_addc_u32 s17, s13, s4
 ; GFX8-NEXT:    s_add_u32 s0, s0, s6
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    s_mov_b32 s7, s6
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX8-NEXT:    s_xor_b64 s[12:13], s[0:1], s[6:7]
@@ -1290,7 +1323,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_sub_u32 s18, 0, s12
-; GFX8-NEXT:    s_subb_u32 s19, 0, s13
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_and_b32 s19, s19, 1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1298,6 +1332,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_subb_u32 s19, 0, s13
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0
 ; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2]
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v3, v0
@@ -1310,14 +1346,14 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v4, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
@@ -1331,9 +1367,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT:    s_xor_b64 s[18:19], s[4:5], s[6:7]
-; GFX8-NEXT:    s_ashr_i32 s6, s15, 31
-; GFX8-NEXT:    s_mov_b32 s7, s6
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
@@ -1380,154 +1413,166 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s16, v0
 ; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s13, v4, v[1:2]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s13
-; GFX8-NEXT:    s_ashr_i32 s16, s3, 31
 ; GFX8-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[0:1], s17, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
-; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s12, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX8-NEXT:    v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], 1, v4
+; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s12, v7
+; GFX8-NEXT:    v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc
+; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v4
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v9
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v8
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v9
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v8
+; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v1
+; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
+; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s12, v2
+; GFX8-NEXT:    s_xor_b64 s[16:17], s[4:5], s[6:7]
+; GFX8-NEXT:    s_ashr_i32 s6, s15, 31
+; GFX8-NEXT:    s_ashr_i32 s12, s3, 31
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
-; GFX8-NEXT:    s_add_u32 s0, s14, s6
-; GFX8-NEXT:    s_addc_u32 s1, s15, s6
-; GFX8-NEXT:    s_add_u32 s2, s2, s16
-; GFX8-NEXT:    s_mov_b32 s17, s16
-; GFX8-NEXT:    s_addc_u32 s3, s3, s16
-; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v14, s3
-; GFX8-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
-; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, s12, v8
-; GFX8-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v0, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v14
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v5
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_add_u32 s14, s14, s6
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_and_b32 s0, s0, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX8-NEXT:    s_addc_u32 s15, s15, s6
+; GFX8-NEXT:    s_add_u32 s0, s2, s12
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
+; GFX8-NEXT:    s_mov_b32 s13, s12
+; GFX8-NEXT:    s_addc_u32 s1, s3, s12
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v1, v12, vcc
-; GFX8-NEXT:    s_xor_b64 s[12:13], s[0:1], s[6:7]
+; GFX8-NEXT:    s_xor_b64 s[2:3], s[0:1], s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v10, v13, vcc
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v10, s3
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v11, s2
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[0:1]
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v10
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v11
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v3, v9, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v2, v5, vcc
+; GFX8-NEXT:    s_mov_b32 s7, s6
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX8-NEXT:    v_trunc_f32_e32 v11, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v11
+; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v12, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v0
+; GFX8-NEXT:    s_xor_b64 s[14:15], s[14:15], s[6:7]
 ; GFX8-NEXT:    s_sub_u32 s5, 0, s2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v10, v2
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[18:19], s5, v5, 0
+; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_and_b32 s20, s20, 1
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[18:19], s5, v10, v[1:2]
+; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX8-NEXT:    s_subb_u32 s20, 0, s3
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v10, v3, v10, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v15, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[18:19], s20, v5, v[1:2]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v14, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], s5, v5, v[1:2]
-; GFX8-NEXT:    v_mul_lo_u32 v3, v5, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], s20, v12, v[1:2]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v16, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v3, v10, v0
+; GFX8-NEXT:    v_mul_lo_u32 v8, v5, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v8, v12, v1
-; GFX8-NEXT:    v_mul_hi_u32 v2, v12, v0
-; GFX8-NEXT:    v_mul_hi_u32 v0, v5, v0
-; GFX8-NEXT:    v_xor_b32_e32 v9, s19, v10
+; GFX8-NEXT:    v_mul_hi_u32 v2, v5, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v3, v5, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, v10, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT:    v_mul_hi_u32 v8, v12, v1
+; GFX8-NEXT:    v_mul_hi_u32 v8, v5, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v8
-; GFX8-NEXT:    v_mul_hi_u32 v1, v5, v1
+; GFX8-NEXT:    v_mul_hi_u32 v1, v10, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v12, v0
-; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0
-; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v1, s18, v4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v5, 0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, v10, v1, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v1, s16, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
-; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v10, s19
-; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s18, v1
-; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
+; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s5, v8, v[0:1]
+; GFX8-NEXT:    v_xor_b32_e32 v9, s17, v9
+; GFX8-NEXT:    v_mov_b32_e32 v10, s17
+; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v5, v[3:4]
+; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s16, v1
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v9, v10, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v4, s4, v7
-; GFX8-NEXT:    v_mul_lo_u32 v7, v5, v2
-; GFX8-NEXT:    v_mul_lo_u32 v9, v8, v3
-; GFX8-NEXT:    v_mul_hi_u32 v11, v8, v2
-; GFX8-NEXT:    v_mul_hi_u32 v2, v5, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, v8, v2
+; GFX8-NEXT:    v_mul_lo_u32 v9, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v11, v5, v2
+; GFX8-NEXT:    v_mul_hi_u32 v2, v8, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v6, s4, v6
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v11, v5, v3
+; GFX8-NEXT:    v_mul_lo_u32 v11, v8, v3
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT:    v_mul_hi_u32 v9, v8, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, v5, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v11, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v11, v9
-; GFX8-NEXT:    v_mul_hi_u32 v3, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v3, v8, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NEXT:    v_mul_lo_u32 v7, s13, v2
-; GFX8-NEXT:    v_mul_lo_u32 v8, s12, v3
+; GFX8-NEXT:    v_mul_lo_u32 v7, s15, v2
+; GFX8-NEXT:    v_mul_lo_u32 v8, s14, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v6, v10, vcc
-; GFX8-NEXT:    v_mul_hi_u32 v6, s12, v2
+; GFX8-NEXT:    v_mul_hi_u32 v6, s14, v2
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v7, s13, v3
-; GFX8-NEXT:    v_mul_hi_u32 v2, s13, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s15, v3
+; GFX8-NEXT:    v_mul_hi_u32 v2, s15, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT:    v_mul_hi_u32 v8, s12, v3
+; GFX8-NEXT:    v_mul_hi_u32 v8, s14, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v2, v6
-; GFX8-NEXT:    v_mul_hi_u32 v9, s13, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, s15, v3
 ; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
 ; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4]
-; GFX8-NEXT:    v_mov_b32_e32 v10, s13
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s12, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, s15
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s14, v2
 ; GFX8-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_subb_u32_e64 v7, s[0:1], v10, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s13, v6
+; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s15, v6
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
@@ -1560,7 +1605,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v14, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v2, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[16:17]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v8
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s1, v9
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s1
@@ -1588,8 +1633,14 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    s_ashr_i32 s4, s13, 31
 ; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
 ; GFX9-NEXT:    s_add_u32 s16, s12, s4
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    s_addc_u32 s17, s13, s4
 ; GFX9-NEXT:    s_add_u32 s0, s0, s6
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    s_mov_b32 s7, s6
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s6
 ; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[6:7]
@@ -1601,7 +1652,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_sub_u32 s18, 0, s12
-; GFX9-NEXT:    s_subb_u32 s19, 0, s13
+; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_and_b32 s19, s19, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1609,6 +1661,8 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_subb_u32 s19, 0, s13
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v0
@@ -1621,20 +1675,21 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v4, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v5, v2, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s18, v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, s17
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s13
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s18, v4, v[1:2]
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v0
@@ -1642,9 +1697,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT:    s_xor_b64 s[18:19], s[4:5], s[6:7]
-; GFX9-NEXT:    s_ashr_i32 s6, s15, 31
-; GFX9-NEXT:    s_mov_b32 s7, s6
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
@@ -1683,159 +1735,170 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s12, v5, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_add3_u32 v4, v3, v0, v6
+; GFX9-NEXT:    v_add3_u32 v6, v3, v0, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s12, v4, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s17
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, s16, v1
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s13, v5, v[2:3]
-; GFX9-NEXT:    s_ashr_i32 s16, s3, 31
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s12, v6, v[0:1]
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s16, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s13, v5, v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v1, s17, v2
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v8, v3, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, s17, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s12, v1
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[0:1], 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v8
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
-; GFX9-NEXT:    v_subrev_co_u32_e32 v10, vcc, s12, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v2, v3, s[0:1]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v11, s[0:1], 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], 1, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[0:1], 0, v4, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v2, v13, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s14, s6
-; GFX9-NEXT:    s_addc_u32 s1, s15, s6
-; GFX9-NEXT:    s_add_u32 s2, s2, s16
-; GFX9-NEXT:    s_mov_b32 s17, s16
-; GFX9-NEXT:    s_addc_u32 s3, s3, s16
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v16, s2
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v16
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s12, v10
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v16, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v2
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v17, v2
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v17
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v18, v1
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v9
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s12, v8
+; GFX9-NEXT:    s_xor_b64 s[16:17], s[4:5], s[6:7]
+; GFX9-NEXT:    s_ashr_i32 s6, s15, 31
+; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
+; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT:    s_add_u32 s14, s14, s6
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_and_b32 s0, s0, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_addc_u32 s15, s15, s6
+; GFX9-NEXT:    s_add_u32 s0, s2, s12
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_mov_b32 s13, s12
+; GFX9-NEXT:    s_addc_u32 s1, s3, s12
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], s[12:13]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v16, s3
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v15, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s2
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; GFX9-NEXT:    v_mul_f32_e32 v12, 0x4f800000, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, v12, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v5, v10, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s7, s6
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, 0xcf800000, v5
+; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v3
+; GFX9-NEXT:    s_xor_b64 s[14:15], s[14:15], s[6:7]
 ; GFX9-NEXT:    s_sub_u32 s5, 0, s2
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v18, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v3, v14, vcc
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v17
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v5
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[18:19], s5, v12, 0
+; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_and_b32 s20, s20, 1
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[18:19], s5, v13, v[4:5]
+; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX9-NEXT:    s_subb_u32 s20, 0, s3
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v14, v[2:3]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v4, v12, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[14:15], s20, v18, v[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v3, v14, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v16, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v18, v2
-; GFX9-NEXT:    v_mul_hi_u32 v11, v18, v1
-; GFX9-NEXT:    v_mul_hi_u32 v1, v14, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v11, v14, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_mul_hi_u32 v4, v18, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, v14, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v11, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v4, v11, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v18, v1
-; GFX9-NEXT:    v_add3_u32 v2, v4, v3, v2
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[14:15], s5, v11, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v14, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2]
-; GFX9-NEXT:    v_xor_b32_e32 v8, s18, v5
-; GFX9-NEXT:    v_xor_b32_e32 v9, s19, v9
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s20, v11, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v10, s19
-; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s18, v8
-; GFX9-NEXT:    v_xor_b32_e32 v5, s4, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v12, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v9, v10, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v9, v11, v3
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v11, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v13, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, v12, v4
+; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v3, v13, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, v12, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v12, v3
+; GFX9-NEXT:    v_mul_lo_u32 v11, v13, v4
 ; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT:    v_mul_hi_u32 v8, v11, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v12, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v9, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v8, v12, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v13, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v11, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
+; GFX9-NEXT:    v_add_u32_e32 v8, v11, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v4, v8, v7, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v12, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v13, v4, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[18:19], s5, v7, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v1, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v2, v9, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v8, v[1:2]
+; GFX9-NEXT:    v_xor_b32_e32 v11, s17, v5
+; GFX9-NEXT:    v_xor_b32_e32 v10, s16, v10
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s20, v7, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v12, s17
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s16, v10
+; GFX9-NEXT:    v_xor_b32_e32 v5, s4, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v8, v3
+; GFX9-NEXT:    v_mul_lo_u32 v10, v7, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v11, v12, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v11, v7, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v11, v8, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v8, v3
+; GFX9-NEXT:    v_add_u32_e32 v6, v10, v6
+; GFX9-NEXT:    v_mul_hi_u32 v10, v7, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v8, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v11, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v12, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, s13, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s12, v4
-; GFX9-NEXT:    v_mul_hi_u32 v10, s12, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, s13, v3
-; GFX9-NEXT:    v_mul_hi_u32 v12, s13, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_add_u32_e32 v10, v11, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v4, v10, v6, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, s15, v3
+; GFX9-NEXT:    v_mul_lo_u32 v7, s14, v4
+; GFX9-NEXT:    v_mul_hi_u32 v10, s14, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, s15, v3
+; GFX9-NEXT:    v_mul_hi_u32 v13, s15, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, s13, v4
-; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT:    v_mul_hi_u32 v8, s12, v4
-; GFX9-NEXT:    v_xor_b32_e32 v6, s4, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v10, s15, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
+; GFX9-NEXT:    v_mul_hi_u32 v7, s14, v4
+; GFX9-NEXT:    v_xor_b32_e32 v9, s4, v9
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v3, v7
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0
-; GFX9-NEXT:    v_mov_b32_e32 v9, s4
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v3, v6
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, s4
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s4, v5
-; GFX9-NEXT:    v_add_u32_e32 v8, v10, v8
-; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
-; GFX9-NEXT:    v_add3_u32 v9, v8, v7, v12
+; GFX9-NEXT:    v_add_u32_e32 v7, v10, v7
+; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v9, v8, vcc
+; GFX9-NEXT:    v_add3_u32 v9, v7, v12, v13
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s2, v9, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v10, s13
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s12, v3
+; GFX9-NEXT:    v_mov_b32_e32 v10, s15
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s14, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[0:1], s3, v11, v[7:8]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[0:1], v10, v7, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v8
-; GFX9-NEXT:    v_sub_u32_e32 v7, s13, v7
+; GFX9-NEXT:    v_sub_u32_e32 v7, s15, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
@@ -1867,7 +1930,7 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v3, v7, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v4, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[16:17]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v10
 ; GFX9-NEXT:    v_xor_b32_e32 v4, s1, v9
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s1
@@ -1885,295 +1948,312 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-LABEL: sdivrem_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x20
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x20
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s16, s1, 31
 ; GFX10-NEXT:    s_ashr_i32 s4, s13, 31
-; GFX10-NEXT:    s_mov_b32 s17, s16
+; GFX10-NEXT:    s_ashr_i32 s16, s1, 31
 ; GFX10-NEXT:    s_add_u32 s12, s12, s4
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_mov_b32 s17, s16
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_addc_u32 s13, s13, s4
 ; GFX10-NEXT:    s_add_u32 s0, s0, s16
-; GFX10-NEXT:    s_addc_u32 s1, s1, s16
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_mov_b32 s5, s4
+; GFX10-NEXT:    s_addc_u32 s1, s1, s16
 ; GFX10-NEXT:    s_xor_b64 s[6:7], s[0:1], s[16:17]
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[12:13], s[4:5]
+; GFX10-NEXT:    s_sub_u32 s20, 0, s6
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX10-NEXT:    s_sub_u32 s21, 0, s6
-; GFX10-NEXT:    s_subb_u32 s20, 0, s7
+; GFX10-NEXT:    s_and_b32 s12, s12, 1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX10-NEXT:    s_subb_u32 s21, 0, s7
 ; GFX10-NEXT:    s_ashr_i32 s12, s15, 31
 ; GFX10-NEXT:    s_xor_b64 s[18:19], s[4:5], s[16:17]
 ; GFX10-NEXT:    s_ashr_i32 s16, s3, 31
 ; GFX10-NEXT:    s_add_u32 s14, s14, s12
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_mov_b32 s17, s16
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_mov_b32 s13, s12
 ; GFX10-NEXT:    s_addc_u32 s15, s15, s12
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX10-NEXT:    s_add_u32 s2, s2, s16
-; GFX10-NEXT:    s_mov_b32 s17, s16
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX10-NEXT:    s_addc_u32 s3, s3, s16
-; GFX10-NEXT:    s_mov_b32 s13, s12
+; GFX10-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
 ; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s2
-; GFX10-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX10-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v1
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v2
-; GFX10-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v9, v2
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v6, v4
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v0
-; GFX10-NEXT:    v_mul_f32_e32 v4, 0xcf800000, v6
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s21, v7, 0
-; GFX10-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v3
+; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX10-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v0
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v4
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s20, v5, 0
+; GFX10-NEXT:    v_trunc_f32_e32 v6, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s5, s20, v7, v[1:2]
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0xcf800000, v6
 ; GFX10-NEXT:    s_sub_u32 s5, 0, s2
+; GFX10-NEXT:    v_mul_hi_u32 v11, v7, v0
+; GFX10-NEXT:    v_mul_hi_u32 v10, v5, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s23, s21, v5, v[1:2]
+; GFX10-NEXT:    v_mul_lo_u32 v4, v7, v0
+; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v3
-; GFX10-NEXT:    v_mul_hi_u32 v10, v9, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s22, s5, v8, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s22, s21, v9, v[1:2]
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v6
-; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v6, v7, v0
+; GFX10-NEXT:    s_and_b32 s22, s22, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX10-NEXT:    v_mul_lo_u32 v9, v5, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s23, s5, v8, 0
+; GFX10-NEXT:    v_mul_lo_u32 v12, v7, v1
+; GFX10-NEXT:    v_mul_hi_u32 v13, v5, v1
 ; GFX10-NEXT:    s_subb_u32 s22, 0, s3
-; GFX10-NEXT:    v_mul_hi_u32 v12, v8, v2
-; GFX10-NEXT:    v_mul_lo_u32 v11, v5, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s23, s20, v7, v[4:5]
-; GFX10-NEXT:    v_mul_lo_u32 v4, v9, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s5, v5, v[1:2]
-; GFX10-NEXT:    v_mul_hi_u32 v2, v5, v2
-; GFX10-NEXT:    v_mul_lo_u32 v13, v7, v3
-; GFX10-NEXT:    v_mul_lo_u32 v14, v9, v3
-; GFX10-NEXT:    v_mul_hi_u32 v15, v7, v3
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s22, v8, v[0:1]
-; GFX10-NEXT:    v_mul_hi_u32 v1, v9, v3
-; GFX10-NEXT:    v_add_co_u32 v3, s23, v4, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v10, s23, v14, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s23
-; GFX10-NEXT:    v_mul_lo_u32 v14, v8, v0
-; GFX10-NEXT:    v_add_co_u32 v3, s23, v3, v6
+; GFX10-NEXT:    v_add_co_u32 v9, s23, v4, v9
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v11, s23, v12, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s23
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s23, s5, v6, v[0:1]
+; GFX10-NEXT:    v_add_co_u32 v0, s23, v9, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s23
+; GFX10-NEXT:    v_add_co_u32 v4, s23, v11, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s23
+; GFX10-NEXT:    v_mul_hi_u32 v10, v7, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v14, v0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s22, v8, v[3:4]
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, v12, v9
+; GFX10-NEXT:    v_mul_hi_u32 v12, v6, v2
+; GFX10-NEXT:    v_add_co_u32 v1, s23, v4, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v6, s23, v10, v15
-; GFX10-NEXT:    v_mul_lo_u32 v15, v5, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
-; GFX10-NEXT:    v_mul_hi_u32 v16, v8, v0
-; GFX10-NEXT:    v_mul_hi_u32 v17, v5, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v4, v3
-; GFX10-NEXT:    v_add_co_u32 v4, s23, v11, v14
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v13, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v2, s23, v15, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v0, s23, v6, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v4, s23, v4, v12
+; GFX10-NEXT:    v_mul_lo_u32 v4, v6, v2
+; GFX10-NEXT:    v_mul_hi_u32 v11, v8, v2
+; GFX10-NEXT:    v_add_co_u32 v5, vcc_lo, v5, v1
+; GFX10-NEXT:    v_add3_u32 v2, v9, v3, v10
+; GFX10-NEXT:    v_mul_lo_u32 v3, v8, v0
+; GFX10-NEXT:    v_mul_lo_u32 v9, v6, v0
+; GFX10-NEXT:    v_mul_hi_u32 v10, v8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, v6, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v2, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s23, s20, v5, 0
+; GFX10-NEXT:    v_add_co_u32 v3, s23, v4, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s23
-; GFX10-NEXT:    v_add_co_u32 v2, s23, v2, v16
-; GFX10-NEXT:    v_add3_u32 v1, v3, v6, v1
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v7, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v10, v4
+; GFX10-NEXT:    v_add_co_u32 v9, s23, v9, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s23
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v9, v1, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s21, v6, 0
-; GFX10-NEXT:    v_add_co_u32 v2, s23, v2, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v11, v12
+; GFX10-NEXT:    v_add_co_u32 v3, s23, v3, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s23
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0
-; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v2
-; GFX10-NEXT:    v_mul_hi_u32 v11, v7, v0
-; GFX10-NEXT:    v_add3_u32 v3, v4, v3, v17
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v5, v3, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s23, s5, v8, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s21, s21, v7, v[1:2]
-; GFX10-NEXT:    v_mov_b32_e32 v1, v3
-; GFX10-NEXT:    v_mul_lo_u32 v12, v9, v2
-; GFX10-NEXT:    v_mul_hi_u32 v13, v8, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s20, s20, v6, v[4:5]
-; GFX10-NEXT:    v_mul_lo_u32 v4, v7, v0
-; GFX10-NEXT:    v_mul_hi_u32 v5, v6, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s5, v9, v[1:2]
-; GFX10-NEXT:    v_mul_hi_u32 v2, v9, v2
-; GFX10-NEXT:    v_mul_lo_u32 v14, v6, v3
-; GFX10-NEXT:    v_mul_lo_u32 v15, v7, v3
-; GFX10-NEXT:    v_mul_hi_u32 v16, v6, v3
+; GFX10-NEXT:    v_add_co_u32 v9, s23, v9, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s23
+; GFX10-NEXT:    v_mul_hi_u32 v13, v5, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v3
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s20, s20, v7, v[2:3]
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, v12, v10
+; GFX10-NEXT:    v_mul_hi_u32 v11, v7, v1
+; GFX10-NEXT:    v_add_co_u32 v4, s20, v9, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s20
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s20, s21, v5, v[2:3]
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v4
+; GFX10-NEXT:    v_add3_u32 v0, v10, v9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v9, v7, v1
+; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s20, s5, v8, 0
+; GFX10-NEXT:    v_mul_lo_u32 v10, v5, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v6, v0, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v12, v7, v2
+; GFX10-NEXT:    v_mul_hi_u32 v14, v5, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, v7, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_add_co_u32 v4, s20, v9, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s20
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s5, v6, v[0:1]
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v4, v13
+; GFX10-NEXT:    v_add_co_u32 v10, s20, v12, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s20
+; GFX10-NEXT:    v_add_co_u32 v4, s5, v10, v14
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, v9, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s22, v8, v[0:1]
-; GFX10-NEXT:    v_mul_hi_u32 v1, v7, v3
-; GFX10-NEXT:    v_add_co_u32 v3, s5, v4, v14
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v4, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, v11, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
+; GFX10-NEXT:    v_mul_lo_u32 v9, v6, v3
+; GFX10-NEXT:    v_mul_hi_u32 v11, v8, v3
+; GFX10-NEXT:    v_mul_hi_u32 v3, v6, v3
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v5, v1
+; GFX10-NEXT:    v_add3_u32 v2, v10, v4, v2
+; GFX10-NEXT:    v_mul_lo_u32 v4, v8, v0
+; GFX10-NEXT:    v_mul_lo_u32 v10, v6, v0
+; GFX10-NEXT:    v_mul_hi_u32 v5, v8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v12, s0, v1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v2, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v7, s1, v1
+; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
+; GFX10-NEXT:    v_add_co_u32 v4, s5, v9, v4
+; GFX10-NEXT:    v_mul_lo_u32 v13, s0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v10, v3
+; GFX10-NEXT:    v_mul_lo_u32 v14, s1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v4, s5, v4, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v11, s5, v15, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v5
-; GFX10-NEXT:    v_mul_lo_u32 v15, v8, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v5, s5, v11, v16
-; GFX10-NEXT:    v_mul_lo_u32 v16, v9, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT:    v_mul_hi_u32 v17, v8, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, v9, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v14, v11
-; GFX10-NEXT:    v_add_co_u32 v11, s5, v12, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v2, s5, v16, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v3, s5, v5, v3
+; GFX10-NEXT:    v_mul_hi_u32 v15, s0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v11, s5, v11, v13
+; GFX10-NEXT:    v_add_co_u32 v7, s5, v7, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v2, s5, v2, v17
-; GFX10-NEXT:    v_add3_u32 v1, v4, v5, v1
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v6, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v12, v11
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v14, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v9, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v3
-; GFX10-NEXT:    v_add_co_u32 v2, s5, v2, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v14, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
-; GFX10-NEXT:    v_mul_lo_u32 v11, s0, v1
-; GFX10-NEXT:    v_mul_hi_u32 v7, s0, v3
-; GFX10-NEXT:    v_mul_hi_u32 v3, s1, v3
-; GFX10-NEXT:    v_mul_lo_u32 v12, s1, v1
-; GFX10-NEXT:    v_add3_u32 v0, v5, v4, v0
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
-; GFX10-NEXT:    v_mul_hi_u32 v4, s0, v1
-; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v1
-; GFX10-NEXT:    v_add_co_u32 v1, s5, v6, v11
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v9, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v3, s5, v12, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v7
-; GFX10-NEXT:    v_mul_lo_u32 v0, s15, v2
-; GFX10-NEXT:    v_mul_lo_u32 v12, s14, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v7, s5, v7, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v15
+; GFX10-NEXT:    v_mul_hi_u32 v0, v6, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v4
-; GFX10-NEXT:    v_mul_hi_u32 v9, s14, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v10, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s5
-; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
-; GFX10-NEXT:    v_mul_lo_u32 v7, s15, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v6, v1
-; GFX10-NEXT:    v_add_co_u32 v6, s5, v0, v12
-; GFX10-NEXT:    v_mul_hi_u32 v13, s14, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v11, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
+; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v8, v3
+; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v2
+; GFX10-NEXT:    v_add3_u32 v0, v5, v4, v0
+; GFX10-NEXT:    v_add_co_u32 v4, s5, v1, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, v13, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s5
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v6, v0, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v7, s15, v3
+; GFX10-NEXT:    v_mul_hi_u32 v8, s14, v3
+; GFX10-NEXT:    v_mul_hi_u32 v3, s15, v3
+; GFX10-NEXT:    v_mul_lo_u32 v11, s14, v6
+; GFX10-NEXT:    v_add3_u32 v5, v9, v5, v2
+; GFX10-NEXT:    v_mul_lo_u32 v2, s15, v6
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s6, v4, 0
+; GFX10-NEXT:    v_mul_hi_u32 v9, s14, v6
+; GFX10-NEXT:    v_mul_hi_u32 v6, s15, v6
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0
+; GFX10-NEXT:    v_add_co_u32 v7, s5, v7, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v12, s5, v3, v1
-; GFX10-NEXT:    v_add_co_u32 v2, s20, v7, v2
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s5
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s5, s6, v5, v[1:2]
+; GFX10-NEXT:    v_add_co_u32 v2, s5, v7, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s5
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s6, v12, 0
-; GFX10-NEXT:    v_add_co_u32 v6, s5, v6, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v9, s5, v2, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s20
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v4, 1
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v11, v2
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s5, s7, v4, v[1:2]
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v12, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v5, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
-; GFX10-NEXT:    v_add3_u32 v4, v4, v7, v5
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v11, v6
-; GFX10-NEXT:    v_mul_hi_u32 v5, s15, v8
-; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v12, 1
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s5, s6, v4, v[1:2]
-; GFX10-NEXT:    v_add_co_u32 v6, s5, v9, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v4, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v7, 1
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s5, s7, v12, v[1:2]
-; GFX10-NEXT:    v_add3_u32 v5, v3, v9, v5
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s5, s2, v6, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, 0, v8, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, s0, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v9, s1, v1
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v15, s0, s1, v1, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v0, v3
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v14, s6
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v16, s0, 0, v9, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v8, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v9, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v6, v7, v2, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s1, v1
+; GFX10-NEXT:    v_sub_co_u32 v7, vcc_lo, s0, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v13, s0, s1, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s7, v2, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, v7, s6
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v3, 0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v14
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v16
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v5, v[0:1]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v16
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v19, v18, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v15
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v17, v20, v17, s0
-; GFX10-NEXT:    v_sub_co_u32 v1, s0, v3, s6
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s3, v6, v[0:1]
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v12, v7, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v16, v9, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, s14, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v8, s1, s15, v0, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s15, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v14, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v8
-; GFX10-NEXT:    v_xor_b32_e32 v1, s18, v1
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
-; GFX10-NEXT:    v_xor_b32_e32 v4, s19, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s0
-; GFX10-NEXT:    v_xor_b32_e32 v3, s4, v3
-; GFX10-NEXT:    v_xor_b32_e32 v7, s4, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v18, v17, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v13
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v18, vcc_lo, s7, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v19, v16, s0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s2, v6, v[1:2]
+; GFX10-NEXT:    v_sub_co_u32 v2, s0, v14, s6
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v17, s0, 0, v18, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v14, v2, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s1, s3, v3, v[1:2]
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, v9, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v15, v17, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, s14, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s1, s15, v1, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s15, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v9
+; GFX10-NEXT:    v_xor_b32_e32 v0, s18, v2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v8
+; GFX10-NEXT:    v_xor_b32_e32 v2, s19, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s0
+; GFX10-NEXT:    v_xor_b32_e32 v5, s4, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v2, s2
+; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v8, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v11, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v0, s0, v1, s18
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s19, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v8
+; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s18
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s19, v2, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v9
+; GFX10-NEXT:    v_xor_b32_e32 v2, s4, v7
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v11, vcc_lo, s3, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v9, v12, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v15, s0, v6, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v5, s0
+; GFX10-NEXT:    v_add_co_u32 v15, s0, v3, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v6, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s0
 ; GFX10-NEXT:    v_add_co_u32 v12, s0, v15, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    v_sub_co_u32 v9, s0, v13, s2
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_sub_co_u32 v7, s0, v13, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v11, s0, 0, v11, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v9, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v14, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v5, v15, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v14, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v15, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s0
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[12:13], s[16:17]
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v3, s4
-; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v6
-; GFX10-NEXT:    v_xor_b32_e32 v6, s1, v11
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v7, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v7, s12, v2
-; GFX10-NEXT:    v_xor_b32_e32 v8, s12, v8
-; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v3, s0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s12
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v8, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v2, s4
+; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v3
+; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v6
+; GFX10-NEXT:    v_xor_b32_e32 v6, s12, v8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v5, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v7, s12, v7
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, s12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[8:9]
 ; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[10:11]
 ; GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36f..611516bf5f21d5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -72,18 +72,27 @@ define amdgpu_ps i8 @s_shl_i8(i8 inreg %value, i8 inreg %amount) {
 ; GFX8-LABEL: s_shl_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i8:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i8 %value, %amount
@@ -1731,10 +1740,12 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-NEXT:    s_lshl_b64 s[4:5], s[0:1], s3
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GCN-NEXT:    s_lshl_b64 s[8:9], s[0:1], s10
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
+; GCN-NEXT:    s_and_b32 s3, s11, 1
+; GCN-NEXT:    s_cmp_lg_u32 s3, 0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
 ; GCN-NEXT:    s_cselect_b32 s3, s6, s8
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
+; GCN-NEXT:    s_and_b32 s4, s12, 1
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_cselect_b32 s2, s2, s3
 ; GCN-NEXT:    ; return to shader part epilog
 ;
@@ -1749,12 +1760,14 @@ define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[6:7], s[2:3], s3
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[0:1], s3
+; GFX10PLUS-NEXT:    s_and_b32 s3, s11, 1
 ; GFX10PLUS-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[6:7], s[0:1], s10
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[8:9], 0
 ; GFX10PLUS-NEXT:    s_cselect_b32 s3, s4, s6
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s12, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i65 %value, %amount
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index cfac0c2fa56aaf..797924be225388 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -199,8 +199,14 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_ashr_i32 s6, s3, 31
 ; CHECK-NEXT:    s_ashr_i32 s0, s5, 31
 ; CHECK-NEXT:    s_add_u32 s10, s2, s6
+; CHECK-NEXT:    s_cselect_b32 s7, 1, 0
+; CHECK-NEXT:    s_and_b32 s7, s7, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s7, 0
 ; CHECK-NEXT:    s_addc_u32 s11, s3, s6
 ; CHECK-NEXT:    s_add_u32 s8, s4, s0
+; CHECK-NEXT:    s_cselect_b32 s3, 1, 0
+; CHECK-NEXT:    s_and_b32 s3, s3, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s3, 0
 ; CHECK-NEXT:    s_mov_b32 s1, s0
 ; CHECK-NEXT:    s_addc_u32 s9, s5, s0
 ; CHECK-NEXT:    s_xor_b64 s[8:9], s[8:9], s[0:1]
@@ -211,34 +217,37 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    s_sub_u32 s3, 0, s8
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT:    s_subb_u32 s5, 0, s9
-; CHECK-NEXT:    s_mov_b32 s7, 0
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
+; CHECK-NEXT:    s_and_b32 s5, s5, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v2, v1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v0
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; CHECK-NEXT:    s_subb_u32 s5, 0, s9
+; CHECK-NEXT:    s_mov_b32 s7, 0
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
 ; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v6, v3, v0
 ; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
 ; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v1
+; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v1
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
@@ -1107,9 +1116,12 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    s_cselect_b32 s7, 1, 0
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s7, 0
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
@@ -1119,23 +1131,23 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6]
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v4
+; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v4
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
-; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v8
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
@@ -1201,31 +1213,34 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v10, v0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9]
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
+; GISEL-NEXT:    s_cselect_b32 s7, 1, 0
 ; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
 ; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v10, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
+; GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s7, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v5
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v13, v5
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
-; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v14, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v14, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v16, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
@@ -1702,9 +1717,12 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    s_cselect_b32 s7, 1, 0
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s7, 0
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
@@ -1714,23 +1732,23 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6]
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v4
+; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v4
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
-; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, v6, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v8
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
@@ -1796,31 +1814,34 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v10, v0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9]
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
+; GISEL-NEXT:    s_cselect_b32 s7, 1, 0
 ; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
 ; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v10, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
+; GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s7, 0
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v5
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_sub_i32_e32 v16, vcc, v13, v5
 ; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
-; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v14, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v14, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v16, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 2572f8581f0edf..cd42fc86f3979b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4186,6 +4186,9 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i48:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s4, s0, s2
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_and_b32 s5, s5, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    s_subb_u32 s3, s1, s3
 ; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -4211,6 +4214,9 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-LABEL: s_ssubsat_i48:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s4, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    s_subb_u32 s3, s1, s3
 ; GFX8-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x300000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -4238,6 +4244,9 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX9-NEXT:    s_sub_u32 s4, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -4262,15 +4271,18 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX10-NEXT:    s_sub_u32 s4, s0, s2
-; GFX10-NEXT:    s_subb_u32 s5, s1, s3
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[2:3], 0
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX10-NEXT:    s_add_i32 s3, s2, 0x80000000
-; GFX10-NEXT:    s_xor_b32 s0, s1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
+; GFX10-NEXT:    s_ashr_i32 s1, s5, 31
+; GFX10-NEXT:    s_add_i32 s3, s1, 0x80000000
+; GFX10-NEXT:    s_xor_b32 s0, s2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
 ; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -4282,14 +4294,17 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX11-NEXT:    s_sub_u32 s4, s0, s2
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, s[2:3], 0
+; GFX11-NEXT:    s_and_b32 s5, s5, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX11-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
-; GFX11-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX11-NEXT:    s_add_i32 s3, s2, 0x80000000
-; GFX11-NEXT:    s_xor_b32 s0, s1, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
+; GFX11-NEXT:    s_ashr_i32 s1, s5, 31
+; GFX11-NEXT:    s_add_i32 s3, s1, 0x80000000
+; GFX11-NEXT:    s_xor_b32 s0, s2, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s3, s0
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -4566,6 +4581,9 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s4, s0, s2
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_and_b32 s5, s5, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
@@ -4587,6 +4605,9 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX8-LABEL: s_ssubsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s4, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -4608,6 +4629,9 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX9-LABEL: s_ssubsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s4, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -4629,8 +4653,11 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX10-LABEL: s_ssubsat_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s4, s0, s2
-; GFX10-NEXT:    s_subb_u32 s5, s1, s3
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
@@ -4646,6 +4673,9 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX11-LABEL: s_ssubsat_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_sub_u32 s4, s0, s2
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    s_and_b32 s5, s5, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX11-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
@@ -4929,6 +4959,9 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-LABEL: s_ssubsat_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s8, s0, s4
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_and_b32 s9, s9, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
@@ -4936,14 +4969,17 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX6-NEXT:    s_add_i32 s5, s4, 0x80000000
+; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX6-NEXT:    s_sub_u32 s0, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
-; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX6-NEXT:    s_and_b32 s1, s1, 1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX6-NEXT:    s_sub_u32 s0, s2, s6
+; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
@@ -4967,6 +5003,9 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-LABEL: s_ssubsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s8, s0, s4
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_and_b32 s9, s9, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -4974,14 +5013,17 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX8-NEXT:    s_add_i32 s5, s4, 0x80000000
+; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX8-NEXT:    s_sub_u32 s0, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s9
-; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT:    s_sub_u32 s0, s2, s6
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -5005,6 +5047,9 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-LABEL: s_ssubsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s8, s0, s4
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_and_b32 s9, s9, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -5012,14 +5057,17 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX9-NEXT:    s_add_i32 s5, s4, 0x80000000
+; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    s_sub_u32 s0, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX9-NEXT:    s_sub_u32 s0, s2, s6
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
@@ -5043,8 +5091,11 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-LABEL: s_ssubsat_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s8, s0, s4
-; GFX10-NEXT:    s_subb_u32 s9, s1, s5
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s8
+; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[4:5], 0
 ; GFX10-NEXT:    s_ashr_i32 s4, s9, 31
@@ -5052,14 +5103,17 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX10-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX10-NEXT:    s_sub_u32 s0, s2, s6
-; GFX10-NEXT:    s_subb_u32 s1, s3, s7
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
+; GFX10-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s3, s[6:7], 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
 ; GFX10-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX10-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
@@ -5073,6 +5127,9 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-LABEL: s_ssubsat_v2i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_sub_u32 s8, s0, s4
+; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX11-NEXT:    s_and_b32 s9, s9, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX11-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
@@ -5081,13 +5138,16 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX11-NEXT:    s_add_i32 s5, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s8, s1, s0
 ; GFX11-NEXT:    s_sub_u32 s0, s2, s6
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
+; GFX11-NEXT:    s_and_b32 s1, s1, 1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
+; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX11-NEXT:    s_subb_u32 s1, s3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s3, s[6:7], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s8
 ; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s8
 ; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    s_xor_b32 s1, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s1
@@ -5105,10 +5165,19 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s8, s0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_and_b32 s9, s9, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    s_subb_u32 s9, s1, s5
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX6-NEXT:    s_and_b32 s10, s10, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_subb_u32 s10, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    s_and_b32 s11, s11, 1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s11, s3, s7
@@ -5148,9 +5217,18 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-LABEL: s_ssubsat_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s8, s0, s4
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_and_b32 s9, s9, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    s_subb_u32 s9, s1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX8-NEXT:    s_and_b32 s10, s10, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_subb_u32 s10, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    s_and_b32 s11, s11, 1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -5197,9 +5275,18 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-LABEL: s_ssubsat_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s8, s0, s4
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_and_b32 s9, s9, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX9-NEXT:    s_subb_u32 s9, s1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX9-NEXT:    s_and_b32 s10, s10, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_subb_u32 s10, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_and_b32 s11, s11, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_subb_u32 s11, s3, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -5246,26 +5333,35 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-LABEL: s_ssubsat_i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_sub_u32 s8, s0, s4
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_and_b32 s9, s9, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX10-NEXT:    s_subb_u32 s9, s1, s5
-; GFX10-NEXT:    s_subb_u32 s10, s2, s6
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
+; GFX10-NEXT:    s_and_b32 s10, s10, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_subb_u32 s10, s2, s6
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    s_and_b32 s11, s11, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX10-NEXT:    s_subb_u32 s11, s3, s7
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
 ; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
-; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[4:5], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    s_and_b32 s0, 1, s12
-; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[6:7], 0
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s1, s[4:5], 0
+; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_ashr_i32 s0, s11, 31
-; GFX10-NEXT:    s_and_b32 s1, 1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    s_and_b32 s1, 1, s1
+; GFX10-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
 ; GFX10-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
@@ -5289,26 +5385,35 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-LABEL: s_ssubsat_i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_sub_u32 s8, s0, s4
+; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX11-NEXT:    s_and_b32 s9, s9, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX11-NEXT:    s_subb_u32 s9, s1, s5
-; GFX11-NEXT:    s_subb_u32 s10, s2, s6
+; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
+; GFX11-NEXT:    s_and_b32 s10, s10, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX11-NEXT:    s_subb_u32 s10, s2, s6
+; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_and_b32 s11, s11, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
 ; GFX11-NEXT:    s_subb_u32 s11, s3, s7
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
 ; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
-; GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[4:5], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT:    s_and_b32 s0, 1, s12
-; GFX11-NEXT:    s_cmp_eq_u64 s[6:7], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, s[6:7], 0
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_and_b32 s0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s1, s[4:5], 0
+; GFX11-NEXT:    s_cmp_eq_u64 s[6:7], 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX11-NEXT:    s_ashr_i32 s0, s11, 31
-; GFX11-NEXT:    s_and_b32 s1, 1, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    s_and_b32 s1, 1, s1
+; GFX11-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9
@@ -5943,10 +6048,19 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-LABEL: s_ssubsat_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s16, s0, s8
-; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    s_and_b32 s17, s17, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX6-NEXT:    s_subb_u32 s17, s1, s9
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX6-NEXT:    s_and_b32 s18, s18, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX6-NEXT:    s_subb_u32 s18, s2, s10
+; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, s1
+; GFX6-NEXT:    s_and_b32 s19, s19, 1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s19, s3, s11
@@ -5962,26 +6076,35 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[10:11], 0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s19, 31
-; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_add_i32 s1, s0, 0x80000000
+; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_sub_u32 s0, s4, s12
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX6-NEXT:    s_and_b32 s1, s1, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX6-NEXT:    s_subb_u32 s1, s5, s13
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s17
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s19
+; GFX6-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX6-NEXT:    s_sub_u32 s0, s4, s12
+; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    s_subb_u32 s1, s5, s13
+; GFX6-NEXT:    s_and_b32 s3, s3, 1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    s_subb_u32 s2, s6, s14
+; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
 ; GFX6-NEXT:    s_subb_u32 s3, s7, s15
@@ -6025,9 +6148,18 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-LABEL: s_ssubsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s16, s0, s8
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_and_b32 s17, s17, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    s_subb_u32 s17, s1, s9
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_and_b32 s18, s18, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX8-NEXT:    s_subb_u32 s18, s2, s10
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_and_b32 s19, s19, 1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -6050,25 +6182,34 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s19, 31
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_add_i32 s1, s0, 0x80000000
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    s_sub_u32 s0, s4, s12
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_subb_u32 s1, s5, s13
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s17
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s19
-; GFX8-NEXT:    s_sub_u32 s0, s4, s12
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX8-NEXT:    s_subb_u32 s1, s5, s13
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    s_subb_u32 s2, s6, s14
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
@@ -6119,9 +6260,18 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-LABEL: s_ssubsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s16, s0, s8
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_and_b32 s17, s17, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    s_subb_u32 s17, s1, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX9-NEXT:    s_and_b32 s18, s18, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX9-NEXT:    s_subb_u32 s18, s2, s10
+; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_and_b32 s19, s19, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_subb_u32 s19, s3, s11
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -6144,25 +6294,34 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_and_b32 s0, 1, s2
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s19, 31
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_sub_u32 s0, s4, s12
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_subb_u32 s1, s5, s13
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s17
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_subb_u32 s2, s6, s14
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s18
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s19
-; GFX9-NEXT:    s_sub_u32 s0, s4, s12
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX9-NEXT:    s_subb_u32 s1, s5, s13
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    s_subb_u32 s2, s6, s14
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    s_subb_u32 s3, s7, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
@@ -6212,44 +6371,62 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ;
 ; GFX10-LABEL: s_ssubsat_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_sub_u32 s18, s0, s8
-; GFX10-NEXT:    s_subb_u32 s19, s1, s9
-; GFX10-NEXT:    s_subb_u32 s16, s2, s10
-; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[18:19], s[0:1]
-; GFX10-NEXT:    s_subb_u32 s17, s3, s11
-; GFX10-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
-; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_sub_u32 s16, s0, s8
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_and_b32 s17, s17, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_subb_u32 s17, s1, s9
+; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
+; GFX10-NEXT:    s_and_b32 s18, s18, 1
+; GFX10-NEXT:    v_cmp_gt_u64_e64 s1, s[8:9], 0
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX10-NEXT:    s_subb_u32 s18, s2, s10
+; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
-; GFX10-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
+; GFX10-NEXT:    s_and_b32 s19, s19, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_subb_u32 s19, s3, s11
+; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
+; GFX10-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
+; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s20
 ; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT:    v_cmp_gt_i64_e64 s2, s[10:11], 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_ashr_i32 s8, s17, 31
-; GFX10-NEXT:    s_and_b32 s1, 1, s1
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_ashr_i32 s8, s19, 31
 ; GFX10-NEXT:    s_add_i32 s9, s8, 0x80000000
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[10:11], 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX10-NEXT:    s_and_b32 s1, 1, s0
 ; GFX10-NEXT:    s_sub_u32 s0, s4, s12
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v5, s0
+; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-NEXT:    s_subb_u32 s1, s5, s13
-; GFX10-NEXT:    s_subb_u32 s2, s6, s14
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    s_subb_u32 s3, s7, s15
-; GFX10-NEXT:    v_mov_b32_e32 v5, s0
-; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
+; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s1
+; GFX10-NEXT:    s_subb_u32 s2, s6, s14
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_subb_u32 s3, s7, s15
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
+; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX10-NEXT:    v_cmp_gt_u64_e64 s6, s[12:13], 0
 ; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX10-NEXT:    s_and_b32 s4, 1, s10
@@ -6265,12 +6442,12 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v3, s18
-; GFX10-NEXT:    v_mov_b32_e32 v4, s19
+; GFX10-NEXT:    v_mov_b32_e32 v3, s16
+; GFX10-NEXT:    v_mov_b32_e32 v4, s17
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s16
+; GFX10-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s17
+; GFX10-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s8, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
@@ -6295,42 +6472,60 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-LABEL: s_ssubsat_v2i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_sub_u32 s16, s0, s8
+; GFX11-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX11-NEXT:    s_and_b32 s17, s17, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX11-NEXT:    s_subb_u32 s17, s1, s9
-; GFX11-NEXT:    s_subb_u32 s18, s2, s10
+; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
+; GFX11-NEXT:    s_and_b32 s18, s18, 1
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s1, s[8:9], 0
+; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX11-NEXT:    s_subb_u32 s18, s2, s10
+; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_and_b32 s19, s19, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX11-NEXT:    s_subb_u32 s19, s3, s11
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
 ; GFX11-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
-; GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s20
 ; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, s[10:11], 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX11-NEXT:    s_ashr_i32 s8, s19, 31
-; GFX11-NEXT:    s_and_b32 s1, 1, s1
 ; GFX11-NEXT:    s_add_i32 s9, s8, 0x80000000
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[10:11], 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    s_and_b32 s1, 1, s0
 ; GFX11-NEXT:    s_sub_u32 s0, s4, s12
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
+; GFX11-NEXT:    v_mov_b32_e32 v5, s0
+; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX11-NEXT:    s_subb_u32 s1, s5, s13
-; GFX11-NEXT:    s_subb_u32 s2, s6, s14
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX11-NEXT:    s_subb_u32 s3, s7, s15
-; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
-; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-NEXT:    s_subb_u32 s2, s6, s14
+; GFX11-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX11-NEXT:    s_and_b32 s3, s3, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11-NEXT:    s_subb_u32 s3, s7, s15
+; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
+; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
 ; GFX11-NEXT:    v_cmp_gt_u64_e64 s6, s[12:13], 0
 ; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX11-NEXT:    s_and_b32 s4, 1, s10
 ; GFX11-NEXT:    s_cmp_eq_u64 s[14:15], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 2d85081f5fc969..f1d06f35ffc333 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -71,50 +71,54 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s4
+; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s2, 8
-; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s1
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:1
-; GFX9-NEXT:    s_lshr_b32 s0, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s5
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:3
+; GFX9-NEXT:    s_lshr_b32 s0, s5, 16
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    s_lshr_b32 s0, s5, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:5
-; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s6
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:7
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 16
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    s_lshr_b32 s0, s6, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:9
-; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s7
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:11
+; GFX9-NEXT:    s_lshr_b32 s0, s7, 16
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s7
-; GFX9-NEXT:    s_lshr_b32 s0, s7, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:12
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:13
-; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:14
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
@@ -187,43 +191,47 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
 ; GFX10-NEXT:    s_lshr_b32 s0, s5, 16
 ; GFX10-NEXT:    s_and_b32 s3, 0xffff, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    s_lshr_b32 s2, s2, 8
-; GFX10-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX10-NEXT:    s_lshr_b32 s4, s6, 16
 ; GFX10-NEXT:    s_and_b32 s5, 0xffff, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s6
-; GFX10-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s1
+; GFX10-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX10-NEXT:    s_lshr_b32 s1, s3, 8
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s2
+; GFX10-NEXT:    s_lshr_b32 s2, s6, 8
+; GFX10-NEXT:    v_mov_b32_e32 v7, s1
+; GFX10-NEXT:    s_lshr_b32 s1, s3, 8
 ; GFX10-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX10-NEXT:    v_mov_b32_e32 v7, s6
-; GFX10-NEXT:    v_mov_b32_e32 v8, s1
-; GFX10-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX10-NEXT:    ds_write_b8 v1, v0
 ; GFX10-NEXT:    ds_write_b8 v1, v2 offset:4
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX10-NEXT:    ds_write_b8 v1, v4 offset:2
 ; GFX10-NEXT:    ds_write_b8 v1, v5 offset:6
 ; GFX10-NEXT:    ds_write_b8 v1, v6 offset:1
-; GFX10-NEXT:    ds_write_b8 v1, v7 offset:3
-; GFX10-NEXT:    ds_write_b8 v1, v8 offset:5
+; GFX10-NEXT:    ds_write_b8 v1, v7 offset:5
+; GFX10-NEXT:    ds_write_b8 v1, v0 offset:3
+; GFX10-NEXT:    ds_write_b8 v1, v2 offset:7
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v10, s0
-; GFX10-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX10-NEXT:    ds_write_b8 v1, v9 offset:7
-; GFX10-NEXT:    ds_write_b8 v1, v3 offset:8
-; GFX10-NEXT:    ds_write_b8 v1, v10 offset:9
+; GFX10-NEXT:    v_mov_b32_e32 v8, s0
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s4
+; GFX10-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX10-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX10-NEXT:    ds_write_b8 v1, v0 offset:10
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-NEXT:    s_and_b32 s0, 0xffff, s7
-; GFX10-NEXT:    s_lshr_b32 s1, s7, 16
+; GFX10-NEXT:    s_and_b32 s2, 0xffff, s1
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-NEXT:    ds_write_b8 v1, v3 offset:8
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s0
-; GFX10-NEXT:    s_lshr_b32 s0, s1, 8
+; GFX10-NEXT:    s_lshr_b32 s0, s2, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
+; GFX10-NEXT:    ds_write_b8 v1, v8 offset:9
 ; GFX10-NEXT:    ds_write_b8 v1, v0 offset:11
 ; GFX10-NEXT:    ds_write_b8 v1, v2 offset:12
 ; GFX10-NEXT:    ds_write_b8 v1, v3 offset:13
@@ -237,41 +245,44 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
 ; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x10
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_and_b32 s2, 0xffff, s4
 ; GFX11-NEXT:    s_lshr_b32 s1, s4, 16
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s4
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    s_lshr_b32 s0, s5, 16
 ; GFX11-NEXT:    s_and_b32 s3, 0xffff, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
 ; GFX11-NEXT:    s_lshr_b32 s4, s6, 16
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s6
-; GFX11-NEXT:    s_lshr_b32 s6, s1, 8
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
 ; GFX11-NEXT:    s_lshr_b32 s1, s3, 8
-; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s0
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s1
+; GFX11-NEXT:    s_lshr_b32 s2, s6, 8
+; GFX11-NEXT:    s_lshr_b32 s1, s3, 8
 ; GFX11-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX11-NEXT:    v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v9, s3
+; GFX11-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s1
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s4
 ; GFX11-NEXT:    ds_store_b8 v1, v0
 ; GFX11-NEXT:    ds_store_b8 v1, v6 offset:1
 ; GFX11-NEXT:    ds_store_b8 v1, v4 offset:2
-; GFX11-NEXT:    ds_store_b8 v1, v7 offset:3
+; GFX11-NEXT:    ds_store_b8 v1, v8 offset:3
 ; GFX11-NEXT:    ds_store_b8 v1, v2 offset:4
-; GFX11-NEXT:    ds_store_b8 v1, v8 offset:5
+; GFX11-NEXT:    ds_store_b8 v1, v7 offset:5
 ; GFX11-NEXT:    ds_store_b8 v1, v5 offset:6
 ; GFX11-NEXT:    ds_store_b8 v1, v9 offset:7
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7
-; GFX11-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX11-NEXT:    s_lshr_b32 s0, s1, 8
 ; GFX11-NEXT:    s_lshr_b32 s1, s7, 16
 ; GFX11-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX11-NEXT:    s_and_b32 s0, 0xffff, s7
-; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v7, s1
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s1
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, 8
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v7, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s0
-; GFX11-NEXT:    s_lshr_b32 s0, s1, 8
+; GFX11-NEXT:    s_lshr_b32 s0, s2, 8
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v8, s0
 ; GFX11-NEXT:    ds_store_b8 v1, v3 offset:8
 ; GFX11-NEXT:    ds_store_b8 v1, v0 offset:9
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
index 4ef79b752c4373..ccaf27ddfc5cea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -67,38 +67,41 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s4
+; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s2, 8
-; GFX9-NEXT:    s_lshr_b32 s1, s4, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s1
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:1
-; GFX9-NEXT:    s_lshr_b32 s0, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s5
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:3
+; GFX9-NEXT:    s_lshr_b32 s0, s5, 16
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    s_lshr_b32 s0, s5, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:5
-; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s6
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:7
+; GFX9-NEXT:    s_lshr_b32 s0, s6, 16
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    s_lshr_b32 s0, s6, 16
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:9
-; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    ds_write_b8 v1, v0 offset:10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
@@ -163,33 +166,36 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
 ; GFX10-NEXT:    s_lshr_b32 s4, s6, 16
 ; GFX10-NEXT:    s_and_b32 s5, 0xffff, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s6
-; GFX10-NEXT:    s_lshr_b32 s6, s1, 8
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX10-NEXT:    s_lshr_b32 s1, s3, 8
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s0
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
-; GFX10-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX10-NEXT:    v_mov_b32_e32 v9, s3
+; GFX10-NEXT:    v_mov_b32_e32 v7, s1
+; GFX10-NEXT:    s_lshr_b32 s1, s3, 8
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s2
-; GFX10-NEXT:    v_mov_b32_e32 v10, s0
-; GFX10-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v7, s6
-; GFX10-NEXT:    v_mov_b32_e32 v8, s1
+; GFX10-NEXT:    s_lshr_b32 s2, s6, 8
+; GFX10-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX10-NEXT:    s_and_b32 s5, 0xffff, s4
 ; GFX10-NEXT:    ds_write_b8 v1, v0
 ; GFX10-NEXT:    ds_write_b8 v1, v2 offset:4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    ds_write_b8 v1, v4 offset:2
 ; GFX10-NEXT:    ds_write_b8 v1, v5 offset:6
 ; GFX10-NEXT:    ds_write_b8 v1, v6 offset:1
-; GFX10-NEXT:    ds_write_b8 v1, v7 offset:3
-; GFX10-NEXT:    ds_write_b8 v1, v8 offset:5
-; GFX10-NEXT:    v_mov_b32_e32 v0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v2, s0
-; GFX10-NEXT:    ds_write_b8 v1, v9 offset:7
+; GFX10-NEXT:    ds_write_b8 v1, v7 offset:5
+; GFX10-NEXT:    ds_write_b8 v1, v0 offset:3
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v5, s0
+; GFX10-NEXT:    ds_write_b8 v1, v2 offset:7
 ; GFX10-NEXT:    ds_write_b8 v1, v3 offset:8
-; GFX10-NEXT:    ds_write_b8 v1, v10 offset:9
-; GFX10-NEXT:    ds_write_b8 v1, v0 offset:10
-; GFX10-NEXT:    ds_write_b8 v1, v2 offset:11
+; GFX10-NEXT:    ds_write_b8 v1, v0 offset:9
+; GFX10-NEXT:    ds_write_b8 v1, v4 offset:10
+; GFX10-NEXT:    ds_write_b8 v1, v5 offset:11
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_lds_v3i32_align1:
@@ -198,36 +204,39 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
 ; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x10
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_and_b32 s2, 0xffff, s4
 ; GFX11-NEXT:    s_lshr_b32 s1, s4, 16
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s4
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    s_lshr_b32 s4, s6, 16
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 8
 ; GFX11-NEXT:    s_lshr_b32 s0, s5, 16
 ; GFX11-NEXT:    s_and_b32 s3, 0xffff, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
+; GFX11-NEXT:    s_lshr_b32 s4, s6, 16
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s6
-; GFX11-NEXT:    s_lshr_b32 s6, s1, 8
-; GFX11-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s2
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
 ; GFX11-NEXT:    s_lshr_b32 s1, s3, 8
-; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s0
+; GFX11-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX11-NEXT:    s_and_b32 s5, 0xffff, s4
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s1
+; GFX11-NEXT:    s_lshr_b32 s2, s6, 8
+; GFX11-NEXT:    s_lshr_b32 s1, s3, 8
+; GFX11-NEXT:    v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s2
 ; GFX11-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX11-NEXT:    s_lshr_b32 s5, s4, 8
-; GFX11-NEXT:    v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s1
-; GFX11-NEXT:    v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT:    v_mov_b32_e32 v12, s5
+; GFX11-NEXT:    v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v11, s4
+; GFX11-NEXT:    v_mov_b32_e32 v12, s0
 ; GFX11-NEXT:    ds_store_b8 v1, v0
-; GFX11-NEXT:    ds_store_b8 v1, v7 offset:1
+; GFX11-NEXT:    ds_store_b8 v1, v6 offset:1
 ; GFX11-NEXT:    ds_store_b8 v1, v4 offset:2
-; GFX11-NEXT:    ds_store_b8 v1, v8 offset:3
+; GFX11-NEXT:    ds_store_b8 v1, v9 offset:3
 ; GFX11-NEXT:    ds_store_b8 v1, v2 offset:4
-; GFX11-NEXT:    ds_store_b8 v1, v9 offset:5
+; GFX11-NEXT:    ds_store_b8 v1, v7 offset:5
 ; GFX11-NEXT:    ds_store_b8 v1, v5 offset:6
 ; GFX11-NEXT:    ds_store_b8 v1, v10 offset:7
 ; GFX11-NEXT:    ds_store_b8 v1, v3 offset:8
-; GFX11-NEXT:    ds_store_b8 v1, v11 offset:9
-; GFX11-NEXT:    ds_store_b8 v1, v6 offset:10
+; GFX11-NEXT:    ds_store_b8 v1, v8 offset:9
+; GFX11-NEXT:    ds_store_b8 v1, v11 offset:10
 ; GFX11-NEXT:    ds_store_b8 v1, v12 offset:11
 ; GFX11-NEXT:    s_endpgm
   store <3 x i32> %x, ptr addrspace(3) %out, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index 3741983a3067b8..f451c24544fedf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -449,6 +449,7 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_sub_u32 s0, s0, s1
 ; GFX7-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX7-NEXT:    s_and_b32 s1, s1, 1
 ; GFX7-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -456,6 +457,7 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -463,6 +465,7 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s1
 ; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
   %usubo = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
@@ -477,27 +480,48 @@ define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-LABEL: s_usubo_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_sub_u32 s0, s0, s2
+; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_subb_u32 s1, s1, s3
 ; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
 ; GFX7-NEXT:    s_sub_u32 s0, s0, s2
+; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX7-NEXT:    s_subb_u32 s1, s1, 0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubo_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s3
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_subb_u32 s1, s1, 0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s3
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_subb_u32 s1, s1, 0
 ; GFX9-NEXT:    ; return to shader part epilog
   %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
@@ -515,6 +539,8 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX7-NEXT:    s_sub_u32 s1, s1, s3
 ; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX7-NEXT:    s_and_b32 s2, s2, 1
+; GFX7-NEXT:    s_and_b32 s3, s3, 1
 ; GFX7-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX7-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -525,6 +551,8 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_sub_u32 s1, s1, s3
 ; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -535,6 +563,8 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_sub_u32 s1, s1, s3
 ; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
 ; GFX9-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX9-NEXT:    s_sub_i32 s1, s1, s3
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -679,6 +709,9 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX7-LABEL: s_ssubo_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_sub_u32 s4, s0, s2
+; GFX7-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX7-NEXT:    s_and_b32 s5, s5, 1
+; GFX7-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
@@ -696,6 +729,9 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX8-LABEL: s_ssubo_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s4, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_and_b32 s5, s5, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -713,6 +749,9 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
 ; GFX9-LABEL: s_ssubo_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s4, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_and_b32 s5, s5, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    s_subb_u32 s5, s1, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 788692c94b0cfa..f6a7386f9af5dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2657,6 +2657,7 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s2, s1, s3
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
@@ -2666,6 +2667,7 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s3
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2675,7 +2677,13 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s3
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -2685,7 +2693,13 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s3
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -2695,7 +2709,13 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s2
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s3
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
@@ -2865,28 +2885,52 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: s_uaddsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s2
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_addc_u32 s1, s1, s3
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_uaddsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s3
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s3
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_uaddsat_i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s2
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s3
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
@@ -3046,40 +3090,88 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-LABEL: s_uaddsat_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_addc_u32 s1, s1, s5
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT:    s_add_u32 s2, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s7
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_uaddsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s5
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    s_add_u32 s2, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s3, s7
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_uaddsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s2, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s7
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_uaddsat_v2i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s4
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s5
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    s_add_u32 s2, s2, s6
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s3, s3, s7
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
@@ -3090,9 +3182,21 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: s_uaddsat_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_addc_u32 s1, s1, s5
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_addc_u32 s2, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s7
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -3100,9 +3204,21 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-LABEL: s_uaddsat_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s5
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_addc_u32 s2, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s3, s7
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -3110,9 +3226,21 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-LABEL: s_uaddsat_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s5
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_addc_u32 s2, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s7
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -3120,9 +3248,21 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10PLUS-LABEL: s_uaddsat_i128:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s4
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s5
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s2, s2, s6
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s3, s3, s7
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
@@ -3367,15 +3507,39 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-LABEL: s_uaddsat_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_add_u32 s0, s0, s8
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s1, s1, s9
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s2, s2, s10
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s11
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX6-NEXT:    s_add_u32 s4, s4, s12
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s5, s5, s13
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s6, s6, s14
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_addc_u32 s7, s7, s15
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, s[6:7]
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -3383,15 +3547,39 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-LABEL: s_uaddsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_add_u32 s0, s0, s8
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, s9
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s2, s2, s10
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s3, s3, s11
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX8-NEXT:    s_add_u32 s4, s4, s12
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s5, s5, s13
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s6, s6, s14
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_addc_u32 s7, s7, s15
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], -1, s[6:7]
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -3399,15 +3587,39 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-LABEL: s_uaddsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_add_u32 s0, s0, s8
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s1, s1, s9
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s2, s2, s10
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s11
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX9-NEXT:    s_add_u32 s4, s4, s12
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s5, s5, s13
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s6, s6, s14
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_addc_u32 s7, s7, s15
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, s[6:7]
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -3415,15 +3627,39 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10PLUS-LABEL: s_uaddsat_v2i128:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_add_u32 s0, s0, s8
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s9
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s2, s2, s10
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s3, s3, s11
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], -1, s[0:1]
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], -1, s[2:3]
 ; GFX10PLUS-NEXT:    s_add_u32 s4, s4, s12
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s5, s5, s13
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s6, s6, s14
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_addc_u32 s7, s7, s15
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], -1, s[4:5]
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[6:7], -1, s[6:7]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 1ee521b3dedac1..935628ce31230e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -198,12 +198,15 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; CHECK-NEXT:    s_sub_u32 s4, 0, s2
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-NEXT:    s_mov_b32 s6, 0
 ; CHECK-NEXT:    v_madmk_f32 v1, v1, 0x4f800000, v2
-; CHECK-NEXT:    s_subb_u32 s5, 0, s3
+; CHECK-NEXT:    s_and_b32 s5, s5, 1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_subb_u32 s5, 0, s3
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v1
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 5aef6679347094..4ac58514d47e0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -117,10 +117,13 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s11
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s10
 ; GFX8-NEXT:    s_sub_u32 s2, 0, s10
-; GFX8-NEXT:    s_subb_u32 s3, 0, s11
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX8-NEXT:    s_subb_u32 s3, 0, s11
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
@@ -144,9 +147,9 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -256,10 +259,13 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s15
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s14
 ; GFX9-NEXT:    s_sub_u32 s2, 0, s14
-; GFX9-NEXT:    s_subb_u32 s3, 0, s15
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -389,7 +395,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s15
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s14
 ; GFX10-NEXT:    s_sub_u32 s0, 0, s14
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    s_subb_u32 s1, 0, s15
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -399,9 +409,8 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v2
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s0, v3, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2]
-; GFX10-NEXT:    s_subb_u32 s1, 0, s15
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, s0, v3, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s2, s0, v4, v[1:2]
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2]
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v4, v0
@@ -985,10 +994,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s12
 ; GFX8-NEXT:    s_sub_u32 s2, 0, s12
-; GFX8-NEXT:    s_subb_u32 s3, 0, s13
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX8-NEXT:    s_subb_u32 s3, 0, s13
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1012,9 +1024,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -1030,7 +1042,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT:    s_subb_u32 s3, 0, s15
+; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX8-NEXT:    s_and_b32 s3, s3, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
@@ -1054,6 +1068,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v4, s8, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX8-NEXT:    s_subb_u32 s3, 0, s15
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
@@ -1084,37 +1099,37 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v2, v3, s[0:1]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s15
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v4, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s14
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v2, v3, s[0:1]
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s14
 ; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, s12, v8
-; GFX8-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX8-NEXT:    v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc
+; GFX8-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v6
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
-; GFX8-NEXT:    v_trunc_f32_e32 v14, v2
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v14
-; GFX8-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v15, v1
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v10
-; GFX8-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v14, v14
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v16, v3, v16, s[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3]
-; GFX8-NEXT:    v_add_u32_e64 v17, s[0:1], 1, v12
-; GFX8-NEXT:    v_addc_u32_e64 v18, s[0:1], 0, v13, s[0:1]
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v3, v2, s[0:1]
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX8-NEXT:    v_trunc_f32_e32 v3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v3
+; GFX8-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v15, v1
+; GFX8-NEXT:    v_add_u32_e64 v16, s[0:1], 1, v12
+; GFX8-NEXT:    v_addc_u32_e64 v17, s[0:1], 0, v13, s[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v18, v3
+; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s2, v18, v[2:3]
 ; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s3, v15, v[2:3]
 ; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v5, v4, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v4, v14, v1
+; GFX8-NEXT:    v_mul_lo_u32 v4, v18, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v15, v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v19, vcc, s12, v10
 ; GFX8-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v3, vcc
@@ -1123,8 +1138,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v4, v14, v2
-; GFX8-NEXT:    v_mul_hi_u32 v1, v14, v1
+; GFX8-NEXT:    v_mul_lo_u32 v4, v18, v2
+; GFX8-NEXT:    v_mul_hi_u32 v1, v18, v1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v5, v15, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
@@ -1132,25 +1147,25 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
-; GFX8-NEXT:    v_mul_hi_u32 v2, v14, v2
+; GFX8-NEXT:    v_mul_hi_u32 v2, v18, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v1
 ; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v15, 0
-; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, v14, v2, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v12, v17, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, v18, v2, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v12, v16, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
-; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s2, v14, v[1:2]
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v13, v18, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[1:2]
+; GFX8-NEXT:    v_cndmask_b32_e32 v12, v13, v17, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
 ; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v15, v[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v6, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v12, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v19, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v7, v14, v3
+; GFX8-NEXT:    v_mul_lo_u32 v7, v18, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v9, v15, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v15, v3
@@ -1159,8 +1174,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v8, v14, v4
-; GFX8-NEXT:    v_mul_hi_u32 v3, v14, v3
+; GFX8-NEXT:    v_mul_lo_u32 v8, v18, v4
+; GFX8-NEXT:    v_mul_hi_u32 v3, v18, v3
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v9, v15, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v8, v3
@@ -1168,13 +1183,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
-; GFX8-NEXT:    v_mul_hi_u32 v4, v14, v4
+; GFX8-NEXT:    v_mul_hi_u32 v4, v18, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v7
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v15, v3
-; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v14, v4, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v18, v4, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s11, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v8, s10, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v0, v6, s[0:1]
@@ -1254,10 +1269,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s17
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s16
 ; GFX9-NEXT:    s_sub_u32 s2, 0, s16
-; GFX9-NEXT:    s_subb_u32 s3, 0, s17
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_subb_u32 s3, 0, s17
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
@@ -1298,7 +1316,9 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, v4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT:    s_subb_u32 s3, 0, s19
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_and_b32 s3, s3, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
@@ -1342,47 +1362,48 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s16, v9, v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s13
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_subb_u32 s3, 0, s19
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s17, v8, v[2:3]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s12, v1
 ; GFX9-NEXT:    v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s16, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s17, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s13, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v4, v5, s[0:1]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s19
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v3, v6, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s18
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s19
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v4, v5, s[0:1]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s18
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s16, v2
-; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v8
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
-; GFX9-NEXT:    v_trunc_f32_e32 v15, v4
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0xcf800000, v15
-; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v16, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s16, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s17, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v17, v5, v17, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5]
-; GFX9-NEXT:    v_add_co_u32_e64 v18, s[0:1], 1, v13
-; GFX9-NEXT:    v_addc_co_u32_e64 v19, s[0:1], 0, v14, s[0:1]
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v5, v4, s[0:1]
+; GFX9-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, 0xcf800000, v5
+; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v16, v3
+; GFX9-NEXT:    v_add_co_u32_e64 v17, s[0:1], 1, v13
+; GFX9-NEXT:    v_addc_co_u32_e64 v18, s[0:1], 0, v14, s[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v19, v5
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s2, v19, v[4:5]
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v16, v[4:5]
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v7, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v15, v3
+; GFX9-NEXT:    v_mul_lo_u32 v6, v19, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v16, v4
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v20, vcc, s16, v11
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc
@@ -1391,11 +1412,11 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v15, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v15, v3
+; GFX9-NEXT:    v_mul_lo_u32 v6, v19, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v19, v3
 ; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v16, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v15, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v19, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
@@ -1406,17 +1427,17 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v3
 ; GFX9-NEXT:    v_add3_u32 v4, v6, v5, v4
 ; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[0:1], s2, v16, 0
-; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v4, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v17
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v13, v18, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v19, vcc, v19, v4, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v13, v17, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s2, v15, v[3:4]
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v14, v19, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s2, v19, v[3:4]
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v14, v18, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], s3, v16, v[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v4, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v13, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v8, v15, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, v19, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v16, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v20, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v16, v5
@@ -1425,11 +1446,11 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v11, v15, v6
-; GFX9-NEXT:    v_mul_hi_u32 v5, v15, v5
+; GFX9-NEXT:    v_mul_lo_u32 v11, v19, v6
+; GFX9-NEXT:    v_mul_hi_u32 v5, v19, v5
 ; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v16, v6
-; GFX9-NEXT:    v_mul_hi_u32 v6, v15, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, v19, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v11, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
@@ -1439,7 +1460,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v6, v9, v8, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v16, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v19, v6, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s15, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v2, v7, s[0:1]
@@ -1514,17 +1535,25 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s17
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s19
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s16
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s19
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s16
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s18
 ; GFX10-NEXT:    s_sub_u32 s0, 0, s16
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX10-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_add_f32_e32 v1, v2, v3
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX10-NEXT:    s_subb_u32 s1, 0, s17
-; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT:    s_sub_u32 s2, 0, s18
+; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10-NEXT:    s_subb_u32 s3, 0, s19
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
@@ -1539,14 +1568,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v1
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, s0, v7, 0
-; GFX10-NEXT:    s_sub_u32 s2, 0, s18
-; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s3, s2, v8, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, s0, v7, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s4, s2, v8, 0
 ; GFX10-NEXT:    v_mul_hi_u32 v11, v9, v0
-; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2]
-; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4]
+; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s4, s0, v9, v[1:2]
+; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s4, s2, v10, v[3:4]
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v9, v0
-; GFX10-NEXT:    s_subb_u32 s3, 0, s19
 ; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s4, s1, v7, v[4:5]
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v7, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, s3, v8, v[5:6]
@@ -1693,71 +1720,71 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, s12, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s19, v10, v[5:6]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s13, v3
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s0, s13, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s16, v14
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s13, v3
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s0
 ; GFX10-NEXT:    v_sub_co_u32 v15, s0, s14, v2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v14, s16
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s15, v0, s0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s18, v15
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s15, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v17, vcc_lo, v14, s16
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v17, s1, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s17, v5
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v23, s0, s19, v0, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s17, v18
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s15, v0
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s16, v2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v22, s0, s19, v0, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s17, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s16, v17
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s17, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s17, v18
-; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s1
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s19, v16
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v21, v20, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v20, v19, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s17, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v22, 0, -1, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s0
-; GFX10-NEXT:    v_sub_co_u32 v0, s0, v17, s16
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v12, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v6, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, s1, v15, s18
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v17, v0, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v18, v19, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v3, s1, v2, s16
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v6, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v6, s1, 0, v1, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s18, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v17, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s1
+; GFX10-NEXT:    v_sub_co_u32 v8, s1, v15, s18
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s2, 0, v22, s1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s19, v16
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v22, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s19, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s18, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v21, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s19, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s18, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v13, vcc_lo, v10, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s19, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v13, 1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s19, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v13, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v23, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT:    v_sub_co_u32 v8, s1, v6, s18
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v22, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_sub_co_u32 v3, s1, v8, s18
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, v13, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v13, v14, v17, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v12, v18, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v9, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v18, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v12, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v13, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v15, v6, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v8, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v15, v8, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v9, s1
 ; GFX10-NEXT:    global_store_dwordx4 v11, v[0:3], s[8:9]
 ; GFX10-NEXT:    global_store_dwordx4 v11, v[4:7], s[10:11]
 ; GFX10-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index a7e5ce3d216199..6741d78c5c17b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -195,12 +195,15 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; CHECK-NEXT:    s_sub_u32 s4, 0, s2
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
 ; CHECK-NEXT:    s_mov_b32 s6, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-NEXT:    v_madmk_f32 v1, v1, 0x4f800000, v2
-; CHECK-NEXT:    s_subb_u32 s5, 0, s3
+; CHECK-NEXT:    s_and_b32 s5, s5, 1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_subb_u32 s5, 0, s3
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v1
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
 ; CHECK-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v4
@@ -1098,18 +1101,24 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; GISEL-NEXT:    s_sub_u32 s4, 0, 0x12d8fb
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    s_subb_u32 s5, 0, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_subb_u32 s5, 0, 0
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
+; GISEL-NEXT:    s_cselect_b32 s7, 1, 0
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    s_and_b32 s7, s7, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v7, s4, v6
+; GISEL-NEXT:    s_cmp_lg_u32 s7, 0
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
 ; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v10, s5, v5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 0042d34e235d17..1173716f1704a1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2527,6 +2527,7 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_subb_u32 s2, s1, s3
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
@@ -2536,6 +2537,7 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s3
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2545,7 +2547,13 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s3
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -2555,7 +2563,13 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s3
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -2565,7 +2579,13 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s2
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s3
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
@@ -2733,28 +2753,52 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s0, s0, s2
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s3
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_and_b32 s2, s2, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s2
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s3
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s2
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s3
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_and_b32 s2, s2, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s2
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s3
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
@@ -2914,40 +2958,88 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6-LABEL: s_usubsat_v2i64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s0, s0, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s5
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX6-NEXT:    s_sub_u32 s2, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_subb_u32 s3, s3, s7
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s5
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX8-NEXT:    s_sub_u32 s2, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_subb_u32 s3, s3, s7
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s5
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX9-NEXT:    s_sub_u32 s2, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s7
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_v2i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s4
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s5
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX10PLUS-NEXT:    s_sub_u32 s2, s2, s6
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s7
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
@@ -2958,9 +3050,21 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s0, s0, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s5
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_subb_u32 s2, s2, s6
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_subb_u32 s3, s3, s7
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_and_b32 s4, s4, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -2968,9 +3072,21 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-LABEL: s_usubsat_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s5
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_subb_u32 s2, s2, s6
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_subb_u32 s3, s3, s7
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_and_b32 s4, s4, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -2978,9 +3094,21 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-LABEL: s_usubsat_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s5
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_subb_u32 s2, s2, s6
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s7
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -2988,9 +3116,21 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10PLUS-LABEL: s_usubsat_i128:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s4
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s5
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s2, s2, s6
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s7
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
@@ -3235,15 +3375,39 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-LABEL: s_usubsat_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s0, s0, s8
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s9
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_subb_u32 s2, s2, s10
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_subb_u32 s3, s3, s11
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX6-NEXT:    s_sub_u32 s4, s4, s12
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_subb_u32 s5, s5, s13
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_subb_u32 s6, s6, s14
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_subb_u32 s7, s7, s15
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_and_b32 s8, s8, 1
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -3251,15 +3415,39 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-LABEL: s_usubsat_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s8
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s9
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_subb_u32 s2, s2, s10
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_subb_u32 s3, s3, s11
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX8-NEXT:    s_sub_u32 s4, s4, s12
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_subb_u32 s5, s5, s13
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_subb_u32 s6, s6, s14
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_subb_u32 s7, s7, s15
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_and_b32 s8, s8, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
 ; GFX8-NEXT:    ; return to shader part epilog
@@ -3267,15 +3455,39 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-LABEL: s_usubsat_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s8
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s9
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_subb_u32 s2, s2, s10
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s11
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX9-NEXT:    s_sub_u32 s4, s4, s12
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_subb_u32 s5, s5, s13
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_subb_u32 s6, s6, s14
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_subb_u32 s7, s7, s15
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_and_b32 s8, s8, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -3283,15 +3495,39 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10PLUS-LABEL: s_usubsat_v2i128:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s8
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s9
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s2, s2, s10
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s11
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
 ; GFX10PLUS-NEXT:    s_sub_u32 s4, s4, s12
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s5, s5, s13
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s6, s6, s14
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_subb_u32 s7, s7, s15
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_and_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 1f1c2659e81103..e6c9ad62b7b487 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -431,9 +431,15 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
 ; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2c
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    s_cmp_lt_i32 s0, 3
+; GFX906-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX906-NEXT:    s_and_b32 s1, s1, 1
+; GFX906-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX906-NEXT:    s_cbranch_scc0 .LBB7_3
 ; GFX906-NEXT:  ; %bb.1: ; %LeafBlock
 ; GFX906-NEXT:    s_cmp_ge_i32 s0, 1
+; GFX906-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX906-NEXT:    s_and_b32 s0, s0, 1
+; GFX906-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
 ; GFX906-NEXT:  ; %bb.2:
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
@@ -441,6 +447,9 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
 ; GFX906-NEXT:    s_branch .LBB7_5
 ; GFX906-NEXT:  .LBB7_3: ; %LeafBlock5
 ; GFX906-NEXT:    s_cmp_eq_u32 s0, 3
+; GFX906-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX906-NEXT:    s_and_b32 s0, s0, 1
+; GFX906-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX906-NEXT:    s_cbranch_scc0 .LBB7_6
 ; GFX906-NEXT:  ; %bb.4: ; %sw.bb5
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index 6bb4e2d3dbe26e..67bf9fbcf44a63 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -181,6 +182,9 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
 ; GCN-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
 ; GCN-NEXT:    s_not_b64 s[4:5], s[2:3]
 ; GCN-NEXT:    s_add_u32 s2, s2, s0
+; GCN-NEXT:    s_cselect_b32 s0, 1, 0
+; GCN-NEXT:    s_and_b32 s0, s0, 1
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_addc_u32 s3, s3, s1
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
@@ -191,8 +195,11 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
 ; GFX10-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
 ; GFX10-NEXT:    s_not_b64 s[4:5], s[2:3]
 ; GFX10-NEXT:    s_add_u32 s2, s2, s0
-; GFX10-NEXT:    s_addc_u32 s3, s3, s1
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_addc_u32 s3, s3, s1
 ; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    ; return to shader part epilog
   %xor = xor i64 %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 78d764898a3b93..ab862c84063687 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1457,6 +1457,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-GISEL-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
+; GFX8-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1473,6 +1476,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-GISEL-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
+; GFX10-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1545,6 +1551,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
+; GFX8-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1561,6 +1570,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
+; GFX10-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1633,6 +1645,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GFX8-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
+; GFX8-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1649,6 +1664,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GFX10-GISEL-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
+; GFX10-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1725,6 +1743,9 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
 ; GFX8-GISEL-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
 ; GFX8-GISEL-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-GISEL-NEXT:    s_add_u32 s0, s0, 10
+; GFX8-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX8-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, s1
@@ -1742,6 +1763,9 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
 ; GFX10-GISEL-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
 ; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GFX10-GISEL-NEXT:    s_add_u32 s0, s0, 10
+; GFX10-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index 1b35a89ad7f935..dbfed79ad62b47 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -282,8 +282,14 @@ define amdgpu_ps i64 @s_csh_64_0(i64 inreg %a, i64 inreg %b) {
 ; GISEL-NEXT:    s_lshr_b64 s[6:7], s[0:1], s2
 ; GISEL-NEXT:    s_ashr_i64 s[0:1], s[0:1], s2
 ; GISEL-NEXT:    s_add_u32 s2, s4, s6
+; GISEL-NEXT:    s_cselect_b32 s3, 1, 0
+; GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GISEL-NEXT:    s_addc_u32 s3, s5, s7
 ; GISEL-NEXT:    s_add_u32 s0, s2, s0
+; GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GISEL-NEXT:    s_addc_u32 s1, s3, s1
 ; GISEL-NEXT:    ; return to shader part epilog
   %and = and i64 %b, 63
@@ -313,8 +319,14 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) {
 ; GISEL-NEXT:    s_lshr_b64 s[6:7], s[0:1], s2
 ; GISEL-NEXT:    s_ashr_i64 s[0:1], s[0:1], s2
 ; GISEL-NEXT:    s_add_u32 s2, s4, s6
+; GISEL-NEXT:    s_cselect_b32 s3, 1, 0
+; GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GISEL-NEXT:    s_addc_u32 s3, s5, s7
 ; GISEL-NEXT:    s_add_u32 s0, s2, s0
+; GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GISEL-NEXT:    s_addc_u32 s1, s3, s1
 ; GISEL-NEXT:    ; return to shader part epilog
   %and = and i64 %b, 255
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 1ba5e8f916cbaa..e479dd0c3f965a 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -228,12 +228,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-GISEL-NEXT:    s_or_b32 s2, s6, s2
 ; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-SAFE-GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; VI-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
 ; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; VI-SAFE-GISEL-NEXT:    s_sub_i32 s7, 1, s4
 ; VI-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s4, 12
 ; VI-SAFE-GISEL-NEXT:    s_max_i32 s7, s7, 0
+; VI-SAFE-GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; VI-SAFE-GISEL-NEXT:    s_or_b32 s6, s2, s6
 ; VI-SAFE-GISEL-NEXT:    s_min_i32 s7, s7, 13
 ; VI-SAFE-GISEL-NEXT:    s_bitset1_b32 s2, 12
@@ -243,8 +245,12 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s8, s7
 ; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s7, s2
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-SAFE-GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; VI-SAFE-GISEL-NEXT:    s_or_b32 s2, s8, s2
 ; VI-SAFE-GISEL-NEXT:    s_cmp_lt_i32 s4, 1
+; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s7, 1, 0
+; VI-SAFE-GISEL-NEXT:    s_and_b32 s7, s7, 1
+; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s7, 0
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s2, s6
 ; VI-SAFE-GISEL-NEXT:    s_and_b32 s6, s2, 7
 ; VI-SAFE-GISEL-NEXT:    s_lshr_b32 s2, s2, 2
@@ -256,8 +262,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-GISEL-NEXT:    s_and_b32 s6, s6, 1
 ; VI-SAFE-GISEL-NEXT:    s_add_i32 s2, s2, s6
 ; VI-SAFE-GISEL-NEXT:    s_cmp_gt_i32 s4, 30
+; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
+; VI-SAFE-GISEL-NEXT:    s_and_b32 s6, s6, 1
+; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, 0
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 0x7c00, s2
 ; VI-SAFE-GISEL-NEXT:    s_cmpk_eq_i32 s4, 0x40f
+; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-SAFE-GISEL-NEXT:    s_and_b32 s4, s4, 1
+; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s5, s2
 ; VI-SAFE-GISEL-NEXT:    s_lshr_b32 s3, s3, 16
 ; VI-SAFE-GISEL-NEXT:    s_and_b32 s3, s3, 0x8000
@@ -358,23 +370,29 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-SAFE-GISEL-NEXT:    s_sub_i32 s6, 1, s4
+; GFX10-SAFE-GISEL-NEXT:    s_sub_i32 s7, 1, s4
+; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s4, 12
+; GFX10-SAFE-GISEL-NEXT:    s_max_i32 s7, s7, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
-; GFX10-SAFE-GISEL-NEXT:    s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s4, 12
-; GFX10-SAFE-GISEL-NEXT:    s_min_i32 s6, s6, 13
+; GFX10-SAFE-GISEL-NEXT:    s_min_i32 s7, s7, 13
+; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s6
+; GFX10-SAFE-GISEL-NEXT:    s_lshr_b32 s6, s8, s7
 ; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
-; GFX10-SAFE-GISEL-NEXT:    s_lshr_b32 s9, s8, s6
-; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s7
-; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s9, s6
+; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s6, s7
 ; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
-; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s8
-; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s6, s9, s6
+; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s7, s8
+; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s6, s6, s7
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_lt_i32 s4, 1
+; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s6, s2
 ; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s6, s2, 7
 ; GFX10-SAFE-GISEL-NEXT:    s_lshr_b32 s2, s2, 2
@@ -386,8 +404,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s6, s6, 1
 ; GFX10-SAFE-GISEL-NEXT:    s_add_i32 s2, s2, s6
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_gt_i32 s4, 30
+; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s6, s6, 1
+; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 0x7c00, s2
 ; GFX10-SAFE-GISEL-NEXT:    s_cmpk_eq_i32 s4, 0x40f
+; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s5, s2
 ; GFX10-SAFE-GISEL-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s3, s3, 0x8000
@@ -500,25 +524,34 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX11-SAFE-GISEL-NEXT:    s_sub_i32 s6, 1, s4
+; GFX11-SAFE-GISEL-NEXT:    s_sub_i32 s7, 1, s4
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s4, 12
+; GFX11-SAFE-GISEL-NEXT:    s_max_i32 s7, s7, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
-; GFX11-SAFE-GISEL-NEXT:    s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s4, 12
-; GFX11-SAFE-GISEL-NEXT:    s_min_i32 s6, s6, 13
+; GFX11-SAFE-GISEL-NEXT:    s_min_i32 s7, s7, 13
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s5, s5, 1
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s6
+; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s6, s8, s7
 ; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
-; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s9, s8, s6
-; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s7
-; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s9, s6
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s6, s7
 ; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
-; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s8
-; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s6, s9, s6
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s7, s8
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s6, s6, s7
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_lt_i32 s4, 1
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s7, s7, 1
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s6, s2, 7
 ; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s2, s2, 2
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_eq_u32 s6, 3
@@ -528,15 +561,23 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s6, s7, s6
 ; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s6, s6, 1
-; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-GISEL-NEXT:    s_add_i32 s2, s2, s6
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_gt_i32 s4, 30
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s6, s6, 1
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 0x7c00, s2
 ; GFX11-SAFE-GISEL-NEXT:    s_cmpk_eq_i32 s4, 0x40f
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s5, s2
 ; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s3, s3, 0x8000
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-SAFE-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-SAFE-GISEL-NEXT:    v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
index 5dbfdf24ef36f7..548a8ce0f9d2fb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
@@ -106,19 +106,36 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
 }
 
 define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot64_ne_zero_compare:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e64 s0, 12, v0
-; CHECK-NEXT:    s_mov_b32 s1, 0
-; CHECK-NEXT:    s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT:    s_cbranch_scc1 .LBB7_2
-; CHECK-NEXT:  ; %bb.1: ; %true
-; CHECK-NEXT:    s_mov_b32 s0, 42
-; CHECK-NEXT:    s_branch .LBB7_3
-; CHECK-NEXT:  .LBB7_2: ; %false
-; CHECK-NEXT:    s_mov_b32 s0, 33
-; CHECK-NEXT:    s_branch .LBB7_3
-; CHECK-NEXT:  .LBB7_3:
+; DAGISEL-LABEL: branch_divergent_ballot64_ne_zero_compare:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    v_cmp_gt_u32_e64 s0, 12, v0
+; DAGISEL-NEXT:    s_mov_b32 s1, 0
+; DAGISEL-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; DAGISEL-NEXT:  ; %bb.1: ; %true
+; DAGISEL-NEXT:    s_mov_b32 s0, 42
+; DAGISEL-NEXT:    s_branch .LBB7_3
+; DAGISEL-NEXT:  .LBB7_2: ; %false
+; DAGISEL-NEXT:    s_mov_b32 s0, 33
+; DAGISEL-NEXT:    s_branch .LBB7_3
+; DAGISEL-NEXT:  .LBB7_3:
+;
+; GISEL-LABEL: branch_divergent_ballot64_ne_zero_compare:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_cmp_gt_u32_e64 s0, 12, v0
+; GISEL-NEXT:    s_mov_b32 s1, 0
+; GISEL-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GISEL-NEXT:    s_cbranch_scc1 .LBB7_2
+; GISEL-NEXT:  ; %bb.1: ; %true
+; GISEL-NEXT:    s_mov_b32 s0, 42
+; GISEL-NEXT:    s_branch .LBB7_3
+; GISEL-NEXT:  .LBB7_2: ; %false
+; GISEL-NEXT:    s_mov_b32 s0, 33
+; GISEL-NEXT:    s_branch .LBB7_3
+; GISEL-NEXT:  .LBB7_3:
   %c = icmp ult i32 %v, 12
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
   %ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -155,6 +172,9 @@ define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) {
 ; GISEL-NEXT:    s_mov_b32 s1, 0
 ; GISEL-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GISEL-NEXT:    s_cmp_eq_u64 s[0:1], 0
+; GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s0, 0
 ; GISEL-NEXT:    s_cbranch_scc1 .LBB8_2
 ; GISEL-NEXT:  ; %bb.1: ; %true
 ; GISEL-NEXT:    s_mov_b32 s0, 42
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
index 3781faa54e7dc6..a395239b30b44a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
@@ -96,18 +96,34 @@ entry:
 
 ; Test ballot after phi
 define amdgpu_cs void @phi_uniform(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) {
-; GFX11-LABEL: phi_uniform:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-NEXT:    s_cbranch_scc1 .LBB5_2
-; GFX11-NEXT:  ; %bb.1: ; %if
-; GFX11-NEXT:    s_add_i32 s0, s0, 1
-; GFX11-NEXT:  .LBB5_2: ; %endif
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
-; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GISEL-LABEL: phi_uniform:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GISEL-NEXT:    s_and_b32 s1, s1, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GISEL-NEXT:    s_cbranch_scc1 .LBB5_2
+; GISEL-NEXT:  ; %bb.1: ; %if
+; GISEL-NEXT:    s_add_i32 s0, s0, 1
+; GISEL-NEXT:  .LBB5_2: ; %endif
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GISEL-NEXT:    global_store_b32 v[0:1], v2, off
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT:    s_endpgm
+;
+; SDAG-LABEL: phi_uniform:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_cmp_lg_u32 s1, 0
+; SDAG-NEXT:    s_cbranch_scc1 .LBB5_2
+; SDAG-NEXT:  ; %bb.1: ; %if
+; SDAG-NEXT:    s_add_i32 s0, s0, 1
+; SDAG-NEXT:  .LBB5_2: ; %endif
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; SDAG-NEXT:    global_store_b32 v[0:1], v2, off
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT:    s_endpgm
 entry:
   %cc = icmp ne i32 %s2, 0
   br i1 %cc, label %endif, label %if
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
index 29218a3625216a..7ad75436dceaac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -165,9 +165,15 @@ define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace
 ; GISEL-LABEL: phi_uniform:
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GISEL-NEXT:    s_cbranch_scc1 .LBB5_2
 ; GISEL-NEXT:  ; %bb.1: ; %if
 ; GISEL-NEXT:    s_add_u32 s0, s0, 1
+; GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GISEL-NEXT:  .LBB5_2: ; %endif
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
@@ -220,6 +226,9 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr
 ; GISEL-NEXT:    s_and_saveexec_b64 s[2:3], s[4:5]
 ; GISEL-NEXT:  ; %bb.1: ; %if
 ; GISEL-NEXT:    s_add_u32 s0, s0, 1
+; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
+; GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GISEL-NEXT:  ; %bb.2: ; %endif
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 42e8b2608dc1c0..61ea01b85981fc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -170,6 +170,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ; CI-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x32
 ; CI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-GISEL-NEXT:    s_cmp_lg_u32 s1, s0
+; CI-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; CI-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; CI-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-GISEL-NEXT:    s_cbranch_scc1 .LBB1_2
 ; CI-GISEL-NEXT:  ; %bb.1: ; %bb0
 ; CI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -184,6 +187,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX9-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -198,6 +204,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ; GFX10-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %bb0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
@@ -212,6 +221,10 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ; GFX11-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index f8e60e5eb09a16..58929c2fb2d17b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -237,6 +237,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ; CI-GISEL-NEXT:    s_load_dword s0, s[6:7], 0x33
 ; CI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-GISEL-NEXT:    s_cmp_lg_u32 s1, s0
+; CI-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; CI-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; CI-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-GISEL-NEXT:    s_cbranch_scc1 .LBB1_2
 ; CI-GISEL-NEXT:  ; %bb.1: ; %bb0
 ; CI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -251,6 +254,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ; GFX9-GISEL-NEXT:    s_mov_b64 s[2:3], src_shared_base
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX9-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %bb0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -265,6 +271,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ; GFX10-NEXT:    s_mov_b64 s[2:3], src_shared_base
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %bb0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
@@ -279,6 +288,10 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ; GFX11-NEXT:    s_mov_b64 s[2:3], src_shared_base
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_lg_u32 s1, s3
+; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX11-NEXT:    s_cbranch_scc1 .LBB1_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
index dbe95a8091932a..0e509c0840c123 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
@@ -36,25 +36,41 @@ define amdgpu_ps void @test(ptr addrspace(1) inreg %ptr) {
 }
 
 define amdgpu_ps void @test_loop() {
-; GFX9-LABEL: test_loop:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:  .LBB1_1: ; %loop
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_mov_b32 s0, src_pops_exiting_wave_id
-; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX9-NEXT:    s_cbranch_scc1 .LBB1_1
-; GFX9-NEXT:  ; %bb.2: ; %exit
-; GFX9-NEXT:    s_endpgm
+; SDAG-LABEL: test_loop:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:  .LBB1_1: ; %loop
+; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT:    s_mov_b32 s0, src_pops_exiting_wave_id
+; SDAG-NEXT:    s_cmp_eq_u32 s0, 0
+; SDAG-NEXT:    s_cbranch_scc1 .LBB1_1
+; SDAG-NEXT:  ; %bb.2: ; %exit
+; SDAG-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: test_loop:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:  .LBB1_1: ; %loop
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    s_mov_b32 s0, src_pops_exiting_wave_id
-; GFX10-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX10-NEXT:    s_cbranch_scc1 .LBB1_1
-; GFX10-NEXT:  ; %bb.2: ; %exit
-; GFX10-NEXT:    s_endpgm
+; GFX9-GISEL-LABEL: test_loop:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:  .LBB1_1: ; %loop
+; GFX9-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-GISEL-NEXT:    s_mov_b32 s0, src_pops_exiting_wave_id
+; GFX9-GISEL-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX9-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GFX9-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9-GISEL-NEXT:  ; %bb.2: ; %exit
+; GFX9-GISEL-NEXT:    s_endpgm
+;
+; GFX10-GISEL-LABEL: test_loop:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:  .LBB1_1: ; %loop
+; GFX10-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT:    s_mov_b32 s0, src_pops_exiting_wave_id
+; GFX10-GISEL-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX10-GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-GISEL-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX10-GISEL-NEXT:  ; %bb.2: ; %exit
+; GFX10-GISEL-NEXT:    s_endpgm
   br label %loop
 loop:
   %id = call i32 @llvm.amdgcn.pops.exiting.wave.id()
@@ -183,3 +199,4 @@ define amdgpu_ps void @test_call(ptr addrspace(1) inreg %ptr) {
 
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX10-SDAG: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index a642543c3780db..e8425ab5d8d3cd 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -3,20 +3,39 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
-; GFX12-LABEL: v_s_exp_f32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
-; GFX12-NEXT:    s_cselect_b32 s1, 0x42800000, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT:    s_add_f32 s0, s0, s1
-; GFX12-NEXT:    s_cselect_b32 s1, 0x1f800000, 1.0
-; GFX12-NEXT:    v_s_exp_f32 s0, s0
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: v_s_exp_f32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x42800000, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_add_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x1f800000, 1.0
+; GFX12-SDAG-NEXT:    v_s_exp_f32 s0, s0
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: v_s_exp_f32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0xc2fc0000
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    s_cselect_b32 s2, 0x42800000, 0
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x1f800000, 1.0
+; GFX12-GISEL-NEXT:    s_add_f32 s0, s0, s2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_s_exp_f32 s0, s0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %result = call float @llvm.exp2.f32(float %src)
   ret float %result
 }
@@ -55,20 +74,39 @@ define amdgpu_cs half @v_s_amdgcn_exp_f16(half inreg %src) {
 }
 
 define amdgpu_cs float @v_s_log_f32(float inreg %src) {
-; GFX12-LABEL: v_s_log_f32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-NEXT:    s_cselect_b32 s1, 0x42000000, 0
-; GFX12-NEXT:    v_s_log_f32 s0, s0
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT:    s_sub_f32 s0, s0, s1
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: v_s_log_f32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_cmp_lt_f32 s0, 0x800000
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x42000000, 0
+; GFX12-SDAG-NEXT:    v_s_log_f32 s0, s0
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: v_s_log_f32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0x800000
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42000000, 0
+; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s2
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    v_s_log_f32 s0, s0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %result = call float @llvm.log2.f32(float %src)
   ret float %result
 }
@@ -205,24 +243,34 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
 ; GFX12-GISEL-NEXT:    s_cmp_gt_f32 0xf800000, s0
 ; GFX12-GISEL-NEXT:    s_mul_f32 s2, s0, 0x4f800000
 ; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX12-GISEL-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(TRANS32_DEP_1)
 ; GFX12-GISEL-NEXT:    v_s_sqrt_f32 s2, s0
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
 ; GFX12-GISEL-NEXT:    s_mov_b32 s4, s0
-; GFX12-GISEL-NEXT:    s_mov_b32 s6, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_mov_b32 s7, s0
 ; GFX12-GISEL-NEXT:    s_add_co_i32 s3, s2, -1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    s_xor_b32 s5, s3, 0x80000000
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    s_fmac_f32 s4, s5, s2
 ; GFX12-GISEL-NEXT:    s_add_co_i32 s5, s2, 1
-; GFX12-GISEL-NEXT:    s_xor_b32 s7, s5, 0x80000000
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_xor_b32 s6, s5, 0x80000000
 ; GFX12-GISEL-NEXT:    s_cmp_le_f32 s4, 0
-; GFX12-GISEL-NEXT:    s_fmac_f32 s6, s7, s2
+; GFX12-GISEL-NEXT:    s_fmac_f32 s7, s6, s2
+; GFX12-GISEL-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX12-GISEL-NEXT:    s_cselect_b32 s2, s3, s2
-; GFX12-GISEL-NEXT:    s_cmp_gt_f32 s6, 0
+; GFX12-GISEL-NEXT:    s_cmp_gt_f32 s7, 0
+; GFX12-GISEL-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX12-GISEL-NEXT:    s_cselect_b32 s2, s5, s2
 ; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
 ; GFX12-GISEL-NEXT:    s_mul_f32 s3, s2, 0x37800000
@@ -271,22 +319,42 @@ define amdgpu_cs half @v_amdgcn_sqrt_f16(half inreg %src)  {
 }
 
 define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
-; GFX12-LABEL: srcmods_abs_f32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_bitset0_b32 s0, 31
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-NEXT:    s_mul_f32 s0, s0, s1
-; GFX12-NEXT:    s_cselect_b32 s1, 0x42000000, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX12-NEXT:    v_s_log_f32 s0, s0
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_sub_f32 s0, s0, s1
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-LABEL: srcmods_abs_f32:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_bitset0_b32 s0, 31
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_cmp_lt_f32 s0, 0x800000
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-SDAG-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_cselect_b32 s1, 0x42000000, 0
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-SDAG-NEXT:    v_s_log_f32 s0, s0
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: srcmods_abs_f32:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_bitset0_b32 s0, 31
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0x800000
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    s_cselect_b32 s2, 0x4f800000, 1.0
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42000000, 0
+; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s2
+; GFX12-GISEL-NEXT:    v_s_log_f32 s0, s0
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-GISEL-NEXT:    s_sub_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %abs = call float @llvm.fabs.f32(float %src)
   %result = call float @llvm.log2.f32(float %abs)
   ret float %result
@@ -314,15 +382,18 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
 ; GFX12-GISEL-NEXT:    s_xor_b32 s0, s0, 0x80000000
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    s_cmp_lt_f32 s0, 0x800000
-; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s1
+; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX12-GISEL-NEXT:    s_and_b32 s1, s1, 1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-GISEL-NEXT:    s_cselect_b32 s2, 0x4f800000, 1.0
 ; GFX12-GISEL-NEXT:    s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT:    s_mul_f32 s0, s0, s2
 ; GFX12-GISEL-NEXT:    v_s_log_f32 s0, s0
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
 ; GFX12-GISEL-NEXT:    s_sub_f32 s0, s0, s1
 ; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_2)
 ; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %neg = fneg float %src
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
index cdaac14833e0ef..090bdfc4c8d192 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
@@ -22,6 +22,10 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg %
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GISEL-NEXT:    s_cselect_b32 s0, 1, 0
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT:    s_and_b32 s0, s0, 1
+; GISEL-NEXT:    s_cmp_lg_u32 s0, 0
 ; GISEL-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GISEL-NEXT:  ; %bb.1: ; %bb2
 ; GISEL-NEXT:    scratch_load_b32 v0, v0, off offset:-4