[llvm] [GlobalIsel] Combine G_SELECT (PR #74845)

Thorsten Schütt via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 8 06:53:28 PST 2023


https://github.com/tschuett created https://github.com/llvm/llvm-project/pull/74845

Cleanups and preparation for more combines: add known-bits support for constant conditions, combine selects whose true and false registers are constants, and improve support for vector conditions.

AMDGPU supports vector conditions. X86 has a TODO for vector conditions. AArch64 SVE supports SEL for vector conditions. How do we implement vector conditions with NEON (with bsl)? See arm64-vselect.ll. Vector select currently asserts in the instruction selector.
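
For reference, a minimal LLVM IR example of a select with a vector condition, the kind of input a NEON lowering would have to handle (e.g. by materializing the mask and blending with bsl):

```llvm
; Select with a vector (i1) condition; a hypothetical NEON lowering
; would widen %c to a lane mask and use bsl to blend %t and %f.
define <4 x i32> @vsel(<4 x i1> %c, <4 x i32> %t, <4 x i32> %f) {
  %r = select <4 x i1> %c, <4 x i32> %t, <4 x i32> %f
  ret <4 x i32> %r
}
```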

buildNot does not support scalable vectors: we cannot create scalable constant vectors of -1, and there is no G_NOT. AArch64 SVE has a NOT and a DUP for broadcasting. Something akin to G_CONSTANT_SPLAT, G_CONSTANT_VECTOR, G_SPLAT_VECTOR, G_BROADCAST, or G_HOMOGENEOUS_VECTOR, i.e. an opcode that takes an immediate and creates a (fixed or scalable) vector where every element is that immediate, would solve the buildNot challenge and facilitate new combines, pattern matching, and new selecting optimizations.
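
As a sketch, this is the NOT that buildNot would ideally emit for a scalable vector, written with the splat constant shorthand from the example below (assuming such a splat operand can be materialized):

```llvm
; Bitwise NOT is an xor with an all-ones splat; buildNot cannot
; currently materialize this splat operand for scalable vectors.
define <vscale x 4 x i32> @not(<vscale x 4 x i32> %x) {
  %r = xor <vscale x 4 x i32> %x, splat (i32 -1)
  ret <vscale x 4 x i32> %r
}
```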

P.S. We need to support both integer and floating-point element types.

https://github.com/llvm/llvm-project/pull/74502

```llvm
<vscale x 4 x i32> splat (i32 -1)
```
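
The floating-point analogue of the splat above would be, for example (same assumed splat syntax):

```llvm
<vscale x 4 x float> splat (float -1.0)
```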

From 3fbe241d38a0fe11112caef03d7708a9adb939bb Mon Sep 17 00:00:00 2001
From: Thorsten Schütt <schuett at gmail.com>
Date: Thu, 7 Dec 2023 16:56:35 +0100
Subject: [PATCH] [GlobalIsel] Combine G_SELECT

Cleanups and preparation for more combines: add known-bits support for
constant conditions, combine selects whose true and false registers are
constants, and improve support for vector conditions.

AMDGPU supports vector conditions. X86 has a TODO for vector
conditions. AArch64 SVE supports SEL for vector conditions. How do we
implement vector conditions with NEON (with bsl)? See arm64-vselect.ll.
Vector select currently asserts in the instruction selector.

buildNot does not support scalable vectors: we cannot create scalable
constant vectors of -1, and there is no G_NOT. AArch64 SVE has a NOT
and a DUP for broadcasting. Something akin to G_CONSTANT_SPLAT,
G_CONSTANT_VECTOR, G_SPLAT_VECTOR, G_BROADCAST, or
G_HOMOGENEOUS_VECTOR, i.e. an opcode that takes an immediate and
creates a (fixed or scalable) vector where every element is that
immediate, would solve the buildNot challenge and facilitate new
combines, pattern matching, and new selecting optimizations.

P.S. We need to support both integer and floating-point element types.

https://github.com/llvm/llvm-project/pull/74502

```llvm
<vscale x 4 x i32> splat (i32 -1)
```
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   28 +-
 .../CodeGen/GlobalISel/MachineIRBuilder.h     |   10 +-
 .../include/llvm/Target/GlobalISel/Combine.td |   34 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  412 ++-
 .../CodeGen/GlobalISel/MachineIRBuilder.cpp   |    9 +
 .../AArch64/GlobalISel/combine-select.mir     |  265 +-
 .../postlegalizercombiner-select.mir          |    2 +-
 llvm/test/CodeGen/AArch64/andcompare.ll       |   14 +-
 llvm/test/CodeGen/AArch64/arm64-ccmp.ll       |  124 +-
 llvm/test/CodeGen/AArch64/call-rv-marker.ll   |  447 ++-
 .../AArch64/neon-bitwise-instructions.ll      |   20 +-
 .../stack-probing-dynamic-no-frame-setup.ll   |    6 +-
 .../combine-fold-binop-into-select.mir        |   42 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 2192 ++++++++-------
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 2416 ++++++++---------
 .../GlobalISel/llvm.amdgcn.wqm.demote.ll      |   48 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |   77 +-
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll |  349 ++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |  111 +-
 .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll |  894 +++---
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |   32 +-
 llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll    |   14 +-
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           |   46 +-
 llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll         |  199 +-
 llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll         |  498 ++--
 .../AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll  |    8 +-
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  157 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |   40 +-
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |   40 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |   14 +-
 llvm/test/CodeGen/AMDGPU/rsq.f64.ll           |  950 +++----
 31 files changed, 5247 insertions(+), 4251 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index a4e9c92b48976..f73e4ae7944df 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -423,16 +423,9 @@ class CombinerHelper {
   /// Return true if a G_STORE instruction \p MI is storing an undef value.
   bool matchUndefStore(MachineInstr &MI);
 
-  /// Return true if a G_SELECT instruction \p MI has an undef comparison.
-  bool matchUndefSelectCmp(MachineInstr &MI);
-
   /// Return true if a G_{EXTRACT,INSERT}_VECTOR_ELT has an out of range index.
   bool matchInsertExtractVecEltOutOfBounds(MachineInstr &MI);
 
-  /// Return true if a G_SELECT instruction \p MI has a constant comparison. If
-  /// true, \p OpIdx will store the operand index of the known selected value.
-  bool matchConstantSelectCmp(MachineInstr &MI, unsigned &OpIdx);
-
   /// Replace an instruction with a G_FCONSTANT with value \p C.
   void replaceInstWithFConstant(MachineInstr &MI, double C);
 
@@ -771,9 +764,6 @@ class CombinerHelper {
   bool matchCombineFSubFpExtFNegFMulToFMadOrFMA(MachineInstr &MI,
                                                 BuildFnTy &MatchInfo);
 
-  /// Fold boolean selects to logical operations.
-  bool matchSelectToLogical(MachineInstr &MI, BuildFnTy &MatchInfo);
-
   bool matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &Info);
 
   /// Transform G_ADD(x, G_SUB(y, x)) to y.
@@ -816,6 +806,9 @@ class CombinerHelper {
   // Given a binop \p MI, commute operands 1 and 2.
   void applyCommuteBinOpOperands(MachineInstr &MI);
 
+  // Combine selects.
+  bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -906,6 +899,21 @@ class CombinerHelper {
   /// select (fcmp uge x, 1.0) 1.0, x -> fminnm x, 1.0
   bool matchFPSelectToMinMax(Register Dst, Register Cond, Register TrueVal,
                              Register FalseVal, BuildFnTy &MatchInfo);
+
+  bool isOneOrOneSplat(Register Src, bool AllowUndefs);
+  bool isZeroOrZeroSplat(Register Src, bool AllowUndefs);
+  bool isConstantSplatVector(Register Src, int64_t SplatValue,
+                             bool AllowUndefs);
+  std::optional<APInt> getConstantOrConstantSplatVector(Register Src);
+
+  /// Try to combine selects with constant conditions.
+  bool tryCombineSelectConstantCondition(GSelect *Select, BuildFnTy &MatchInfo);
+
+  /// Try to combine selects with boolean conditions to logical operators.
+  bool tryFoldBoolSelectToLogic(GSelect *Select, BuildFnTy &MatchInfo);
+
+  /// Try to combine selects where the true and false values are constant.
+  bool tryFoldSelectOfConstants(GSelect *Select, BuildFnTy &MatchInfo);
 };
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index e0101a5ac1ca8..7dec611c3e27e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1701,11 +1701,11 @@ class MachineIRBuilder {
 
   /// Build and insert a bitwise not,
   /// \p NegOne = G_CONSTANT -1
-  /// \p Res = G_OR \p Op0, NegOne
-  MachineInstrBuilder buildNot(const DstOp &Dst, const SrcOp &Src0) {
-    auto NegOne = buildConstant(Dst.getLLTTy(*getMRI()), -1);
-    return buildInstr(TargetOpcode::G_XOR, {Dst}, {Src0, NegOne});
-  }
+  /// \p Res = G_XOR \p Op0, NegOne
+  /// Or
+  /// \p NegOne = G_BUILD_VECTOR -1, -1, -1, ...
+  /// \p Res = G_XOR \p Op0, NegOne
+  MachineInstrBuilder buildNot(const DstOp &Dst, const SrcOp &Src0);
 
   /// Build and insert integer negation
   /// \p Zero = G_CONSTANT 0
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 77db371adaf77..5444c368e598b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -419,31 +419,6 @@ def select_same_val: GICombineRule<
   (apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, 2); }])
 >;
 
-// Fold (undef ? x : y) -> y
-def select_undef_cmp: GICombineRule<
-  (defs root:$dst),
-  (match (G_IMPLICIT_DEF $undef),
-         (G_SELECT $dst, $undef, $x, $y)),
-  (apply (GIReplaceReg $dst, $y))
->;
-
-// Fold (true ? x : y) -> x
-// Fold (false ? x : y) -> y
-def select_constant_cmp_matchdata : GIDefMatchData<"unsigned">;
-def select_constant_cmp: GICombineRule<
-  (defs root:$root, select_constant_cmp_matchdata:$matchinfo),
-  (match (wip_match_opcode G_SELECT):$root,
-    [{ return Helper.matchConstantSelectCmp(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${matchinfo}); }])
->;
-
-def select_to_logical : GICombineRule<
-  (defs root:$root, build_fn_matchinfo:$matchinfo),
-  (match (wip_match_opcode G_SELECT):$root,
-    [{ return Helper.matchSelectToLogical(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
->;
-
 // Fold (C op x) -> (x op C)
 // TODO: handle more isCommutable opcodes
 // TODO: handle compares (currently not marked as isCommutable)
@@ -1242,6 +1217,12 @@ def select_to_minmax: GICombineRule<
          [{ return Helper.matchSimplifySelectToMinMax(*${root}, ${info}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
 
+def match_selects : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SELECT):$root,
+        [{ return Helper.matchSelect(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -1281,8 +1262,7 @@ def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend,
 
 def phi_combines : GICombineGroup<[extend_through_phis]>;
 
-def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp,
-                                      select_to_logical]>;
+def select_combines : GICombineGroup<[match_selects]>;
 
 def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd,
                                        mul_by_neg_one, idempotent_prop]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 91a64d59e154d..b12f83e75859a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2611,12 +2611,6 @@ bool CombinerHelper::matchUndefStore(MachineInstr &MI) {
                       MRI);
 }
 
-bool CombinerHelper::matchUndefSelectCmp(MachineInstr &MI) {
-  assert(MI.getOpcode() == TargetOpcode::G_SELECT);
-  return getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MI.getOperand(1).getReg(),
-                      MRI);
-}
-
 bool CombinerHelper::matchInsertExtractVecEltOutOfBounds(MachineInstr &MI) {
   assert((MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT ||
           MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) &&
@@ -2630,16 +2624,6 @@ bool CombinerHelper::matchInsertExtractVecEltOutOfBounds(MachineInstr &MI) {
   return Idx->getZExtValue() >= VecTy.getNumElements();
 }
 
-bool CombinerHelper::matchConstantSelectCmp(MachineInstr &MI, unsigned &OpIdx) {
-  GSelect &SelMI = cast<GSelect>(MI);
-  auto Cst =
-      isConstantOrConstantSplatVector(*MRI.getVRegDef(SelMI.getCondReg()), MRI);
-  if (!Cst)
-    return false;
-  OpIdx = Cst->isZero() ? 3 : 2;
-  return true;
-}
-
 void CombinerHelper::eraseInst(MachineInstr &MI) { MI.eraseFromParent(); }
 
 bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
@@ -5940,62 +5924,6 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA(
   return false;
 }
 
-bool CombinerHelper::matchSelectToLogical(MachineInstr &MI,
-                                          BuildFnTy &MatchInfo) {
-  GSelect &Sel = cast<GSelect>(MI);
-  Register DstReg = Sel.getReg(0);
-  Register Cond = Sel.getCondReg();
-  Register TrueReg = Sel.getTrueReg();
-  Register FalseReg = Sel.getFalseReg();
-
-  auto *TrueDef = getDefIgnoringCopies(TrueReg, MRI);
-  auto *FalseDef = getDefIgnoringCopies(FalseReg, MRI);
-
-  const LLT CondTy = MRI.getType(Cond);
-  const LLT OpTy = MRI.getType(TrueReg);
-  if (CondTy != OpTy || OpTy.getScalarSizeInBits() != 1)
-    return false;
-
-  // We have a boolean select.
-
-  // select Cond, Cond, F --> or Cond, F
-  // select Cond, 1, F    --> or Cond, F
-  auto MaybeCstTrue = isConstantOrConstantSplatVector(*TrueDef, MRI);
-  if (Cond == TrueReg || (MaybeCstTrue && MaybeCstTrue->isOne())) {
-    MatchInfo = [=](MachineIRBuilder &MIB) {
-      MIB.buildOr(DstReg, Cond, FalseReg);
-    };
-    return true;
-  }
-
-  // select Cond, T, Cond --> and Cond, T
-  // select Cond, T, 0    --> and Cond, T
-  auto MaybeCstFalse = isConstantOrConstantSplatVector(*FalseDef, MRI);
-  if (Cond == FalseReg || (MaybeCstFalse && MaybeCstFalse->isZero())) {
-    MatchInfo = [=](MachineIRBuilder &MIB) {
-      MIB.buildAnd(DstReg, Cond, TrueReg);
-    };
-    return true;
-  }
-
- // select Cond, T, 1 --> or (not Cond), T
-  if (MaybeCstFalse && MaybeCstFalse->isOne()) {
-    MatchInfo = [=](MachineIRBuilder &MIB) {
-      MIB.buildOr(DstReg, MIB.buildNot(OpTy, Cond), TrueReg);
-    };
-    return true;
-  }
-
-  // select Cond, 0, F --> and (not Cond), F
-  if (MaybeCstTrue && MaybeCstTrue->isZero()) {
-    MatchInfo = [=](MachineIRBuilder &MIB) {
-      MIB.buildAnd(DstReg, MIB.buildNot(OpTy, Cond), FalseReg);
-    };
-    return true;
-  }
-  return false;
-}
-
 bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI,
                                             unsigned &IdxToPropagate) {
   bool PropagateNaN;
@@ -6318,3 +6246,343 @@ void CombinerHelper::applyCommuteBinOpOperands(MachineInstr &MI) {
   MI.getOperand(2).setReg(LHSReg);
   Observer.changedInstr(MI);
 }
+
+bool CombinerHelper::isOneOrOneSplat(Register Src, bool AllowUndefs) {
+  LLT SrcTy = MRI.getType(Src);
+  if (SrcTy.isFixedVector())
+    return isConstantSplatVector(Src, 1, AllowUndefs);
+  if (SrcTy.isScalar()) {
+    if (AllowUndefs && getOpcodeDef<GImplicitDef>(Src, MRI) != nullptr)
+      return true;
+    auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+    return IConstant && IConstant->Value == 1;
+  }
+  return false; // scalable vector
+}
+
+bool CombinerHelper::isZeroOrZeroSplat(Register Src, bool AllowUndefs) {
+  LLT SrcTy = MRI.getType(Src);
+  if (SrcTy.isFixedVector())
+    return isConstantSplatVector(Src, 0, AllowUndefs);
+  if (SrcTy.isScalar()) {
+    if (AllowUndefs && getOpcodeDef<GImplicitDef>(Src, MRI) != nullptr)
+      return true;
+    auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+    return IConstant && IConstant->Value == 0;
+  }
+  return false; // scalable vector
+}
+
+// Ignores COPYs during conformance checks.
+// FIXME scalable vectors.
+bool CombinerHelper::isConstantSplatVector(Register Src, int64_t SplatValue,
+                                           bool AllowUndefs) {
+  GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+  if (!BuildVector)
+    return false;
+  unsigned NumSources = BuildVector->getNumSources();
+
+  for (unsigned I = 0; I < NumSources; ++I) {
+    GImplicitDef *ImplicitDef =
+        getOpcodeDef<GImplicitDef>(BuildVector->getSourceReg(I), MRI);
+    if (ImplicitDef && AllowUndefs)
+      continue;
+    if (ImplicitDef && !AllowUndefs)
+      return false;
+    std::optional<ValueAndVReg> IConstant =
+        getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+    if (IConstant && IConstant->Value == SplatValue)
+      continue;
+    return false;
+  }
+  return true;
+}
+
+// Ignores COPYs during lookups.
+// FIXME scalable vectors
+std::optional<APInt>
+CombinerHelper::getConstantOrConstantSplatVector(Register Src) {
+  auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+  if (IConstant)
+    return IConstant->Value;
+
+  GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+  if (!BuildVector)
+    return std::nullopt;
+  unsigned NumSources = BuildVector->getNumSources();
+
+  std::optional<APInt> Value = std::nullopt;
+  for (unsigned I = 0; I < NumSources; ++I) {
+    std::optional<ValueAndVReg> IConstant =
+        getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+    if (!IConstant)
+      return std::nullopt;
+    if (!Value)
+      Value = IConstant->Value;
+    else if (*Value != IConstant->Value)
+      return std::nullopt;
+  }
+  return Value;
+}
+
+// TODO: use knownbits to determine zeros
+bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
+                                              BuildFnTy &MatchInfo) {
+  uint32_t Flags = Select->getFlags();
+  Register Dest = Select->getReg(0);
+  Register Cond = Select->getCondReg();
+  Register True = Select->getTrueReg();
+  Register False = Select->getFalseReg();
+  LLT CondTy = MRI.getType(Select->getCondReg());
+  LLT TrueTy = MRI.getType(Select->getTrueReg());
+
+  // Either both are scalars or both are vectors.
+  std::optional<APInt> TrueOpt = getConstantOrConstantSplatVector(True);
+  std::optional<APInt> FalseOpt = getConstantOrConstantSplatVector(False);
+
+  if (!TrueOpt || !FalseOpt)
+    return false;
+
+  // These are only the splat values.
+  APInt TrueValue = *TrueOpt;
+  APInt FalseValue = *FalseOpt;
+
+  // Boolean or fixed vector of booleans.
+  if (CondTy.isScalableVector() ||
+      (CondTy.isFixedVector() &&
+       CondTy.getElementType().getScalarSizeInBits() != 1) ||
+      CondTy.getScalarSizeInBits() != 1)
+    return false;
+
+  // select Cond, 1, 0 --> zext (Cond)
+  if (TrueValue.isOne() && FalseValue.isZero()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      B.buildZExtOrTrunc(Dest, Cond);
+    };
+    return true;
+  }
+
+  // select Cond, -1, 0 --> sext (Cond)
+  if (TrueValue.isAllOnes() && FalseValue.isZero()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      B.buildSExtOrTrunc(Dest, Cond);
+    };
+    return true;
+  }
+
+  // select Cond, 0, 1 --> zext (!Cond)
+  if (TrueValue.isZero() && FalseValue.isOne()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Inner, Cond);
+      B.buildZExtOrTrunc(Dest, Inner);
+    };
+    return true;
+  }
+
+  // select Cond, 0, -1 --> sext (!Cond)
+  if (TrueValue.isZero() && FalseValue.isAllOnes()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Inner, Cond);
+      B.buildSExtOrTrunc(Dest, Inner);
+    };
+    return true;
+  }
+
+  // select Cond, C1, C1-1 --> add (zext Cond), C1-1
+  if (TrueValue - 1 == FalseValue) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Inner, Cond);
+      B.buildAdd(Dest, Inner, False);
+    };
+    return true;
+  }
+
+  // select Cond, C1, C1+1 --> add (sext Cond), C1+1
+  if (TrueValue + 1 == FalseValue) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildSExtOrTrunc(Inner, Cond);
+      B.buildAdd(Dest, Inner, False);
+    };
+    return true;
+  }
+
+  // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
+  if (TrueValue.isPowerOf2() && FalseValue.isZero()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Inner, Cond);
+      // The shift amount must be scalar.
+      LLT ShiftTy = TrueTy.isVector() ? TrueTy.getElementType() : TrueTy;
+      auto ShAmtC = B.buildConstant(ShiftTy, TrueValue.exactLogBase2());
+      B.buildShl(Dest, Inner, ShAmtC, Flags);
+    };
+    return true;
+  }
+
+  // select Cond, -1, C --> or (sext Cond), C
+  if (TrueValue.isAllOnes()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildSExtOrTrunc(Inner, Cond);
+      B.buildOr(Dest, Inner, False, Flags);
+    };
+    return true;
+  }
+
+  // select Cond, C, -1 --> or (sext (not Cond)), C
+  if (FalseValue.isAllOnes()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Not = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Not, Cond);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildSExtOrTrunc(Inner, Not);
+      B.buildOr(Dest, Inner, True, Flags);
+    };
+    return true;
+  }
+
+  return false;
+}
+
+// TODO: use knownbits to determine zeros
+bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select,
+                                              BuildFnTy &MatchInfo) {
+  uint32_t Flags = Select->getFlags();
+  Register DstReg = Select->getReg(0);
+  Register Cond = Select->getCondReg();
+  Register True = Select->getTrueReg();
+  Register False = Select->getFalseReg();
+  LLT CondTy = MRI.getType(Select->getCondReg());
+  LLT TrueTy = MRI.getType(Select->getTrueReg());
+
+  // Boolean or fixed vector of booleans.
+  if (CondTy.isScalableVector() ||
+      (CondTy.isFixedVector() &&
+       CondTy.getElementType().getScalarSizeInBits() != 1) ||
+      CondTy.getScalarSizeInBits() != 1)
+    return false;
+
+  // select Cond, Cond, F --> or Cond, F
+  // select Cond, 1, F    --> or Cond, F
+  if ((Cond == True) || isOneOrOneSplat(True, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Ext, Cond);
+      B.buildOr(DstReg, Ext, False, Flags);
+    };
+    return true;
+  }
+
+  // select Cond, T, Cond --> and Cond, T
+  // select Cond, T, 0    --> and Cond, T
+  if ((Cond == False) || isZeroOrZeroSplat(False, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Ext, Cond);
+      B.buildAnd(DstReg, Ext, True);
+    };
+    return true;
+  }
+
+  // select Cond, T, 1 --> or (not Cond), T
+  if (isOneOrOneSplat(False, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      // First the not.
+      Register Inner = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Inner, Cond);
+      // Then an ext to match the destination register.
+      Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Ext, Inner);
+      B.buildOr(DstReg, Ext, True, Flags);
+    };
+    return true;
+  }
+
+  // select Cond, 0, F --> and (not Cond), F
+  if (isZeroOrZeroSplat(True, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      // First the not.
+      Register Inner = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Inner, Cond);
+      // Then an ext to match the destination register.
+      Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Ext, Inner);
+      B.buildAnd(DstReg, Ext, False);
+    };
+    return true;
+  }
+
+  return false;
+}
+
+bool CombinerHelper::tryCombineSelectConstantCondition(GSelect *Select,
+                                                       BuildFnTy &MatchInfo) {
+  Register Dest = Select->getReg(0);
+  Register Cond = Select->getCondReg();
+  Register True = Select->getTrueReg();
+  Register False = Select->getFalseReg();
+  LLT CondTy = MRI.getType(Select->getCondReg());
+
+  KnownBits Known = KB->getKnownBits(Cond);
+  if (Known.isZero()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      B.buildCopy(Dest, False);
+    };
+    return true;
+  } else if (CondTy.isScalar() && Known.isStrictlyPositive()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      B.buildCopy(Dest, True);
+    };
+    return true;
+  }
+
+  if (isOneOrOneSplat(Cond, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      B.buildCopy(Dest, True);
+    };
+    return true;
+  } else if (isZeroOrZeroSplat(Cond, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      B.buildCopy(Dest, False);
+    };
+    return true;
+  }
+
+  return false;
+}
+
+bool CombinerHelper::matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  GSelect *Select = cast<GSelect>(&MI);
+
+  if (tryCombineSelectConstantCondition(Select, MatchInfo))
+    return true;
+
+  if (tryFoldSelectOfConstants(Select, MatchInfo))
+    return true;
+
+  if (tryFoldBoolSelectToLogic(Select, MatchInfo))
+    return true;
+
+  return false;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 80e9c08e850b6..a34ca1385925a 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -1341,3 +1341,12 @@ MachineIRBuilder::buildInstr(unsigned Opc, ArrayRef<DstOp> DstOps,
     MIB->setFlags(*Flags);
   return MIB;
 }
+
+MachineInstrBuilder MachineIRBuilder::buildNot(const DstOp &Dst,
+                                               const SrcOp &Src0) {
+  assert(!Dst.getLLTTy(*getMRI()).isScalableVector() &&
+         "unexpected scalable vector in buildNot");
+
+  auto NegOne = buildConstant(Dst.getLLTTy(*getMRI()), -1);
+  return buildXor(Dst, Src0, NegOne);
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 81d38a5b08047..e498de1b43f90 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -27,7 +27,7 @@ body:             |
     ; CHECK-LABEL: name: test_combine_select_undef_res0_res1
     ; CHECK: liveins: $x0, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: $x0 = COPY [[COPY]](s64)
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
@@ -253,9 +253,7 @@ body:             |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
     ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
     ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64)
-    ; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true
-    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one
-    ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], %t
+    ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %t
     ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
     ; CHECK-NEXT: $w0 = COPY %ext(s32)
     %0:_(s64) = COPY $x0
@@ -298,3 +296,262 @@ body:             |
     %ext:_(s32) = G_ANYEXT %sel
     $w0 = COPY %ext(s32)
 ...
+---
+# select 0+1, T, F --> T
+name:            bool_add_0_1_true_false_known_bits
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: bool_add_0_1_true_false_known_bits
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %t(s1)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %zero:_(s1) = G_CONSTANT i1 0
+    %one:_(s1) = G_CONSTANT i1 1
+    %add:_(s1) = G_ADD %zero, %one
+    %sel:_(s1) = G_SELECT %add, %t, %f
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 1, 0 --> zext(Cond)
+name:            select_cond_1_0_to_zext_cond
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_1_0_to_zext_cond
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %c(s1)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %zero:_(s1) = G_CONSTANT i1 0
+    %one:_(s1) = G_CONSTANT i1 1
+    %sel:_(s1) = G_SELECT %c, %one, %zero
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 0, 1 --> zext(!Cond)
+name:            select_cond_0_1_to_sext_not_cond
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_0_1_to_sext_not_cond
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT [[XOR]](s1)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %zero:_(s1) = G_CONSTANT i1 0
+    %one:_(s1) = G_CONSTANT i1 1
+    %sel:_(s1) = G_SELECT %c, %zero, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 102, 101 --> add (zext Cond), 101
+name:            select_cond_2_1_to_and_zext_cond_false
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_2_1_to_and_zext_cond_false
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 101
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1)
+    ; CHECK-NEXT: %sel:_(s8) = G_ADD [[ZEXT]], %one
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 102
+    %one:_(s8) = G_CONSTANT i8 101
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 101, 102 --> add (sext Cond), 102
+name:            select_cond_1_2_to_and_sext_cond_false
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_1_2_to_and_sext_cond_false
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 102
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT %c(s1)
+    ; CHECK-NEXT: %sel:_(s8) = G_ADD [[SEXT]], %one
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 101
+    %one:_(s8) = G_CONSTANT i8 102
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 64, 0 --> (zext Cond) << log2(Pow2)
+name:            select_cond_64_0_to_shift
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_64_0_to_shift
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
+    ; CHECK-NEXT: %sel:_(s8) = G_SHL [[ZEXT]], [[C]](s8)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 64
+    %one:_(s8) = G_CONSTANT i8 0
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, -1, 0 --> sext Cond
+name:            select_cond_minus_1_0_to_sext_cond
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_minus_1_0_to_sext_cond
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %ext:_(s32) = G_SEXT %c(s1)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 255
+    %one:_(s8) = G_CONSTANT i8 0
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 0, -1 --> sext (!Cond)
+name:            select_cond_0_minus_1_to_sext_not_cond
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_0_minus_1_to_sext_not_cond
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
+    ; CHECK-NEXT: %ext:_(s32) = G_SEXT [[XOR]](s1)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 0
+    %one:_(s8) = G_CONSTANT i8 255
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, -1, 101 --> or (sext Cond), 101
+name:            select_cond_minus_1_101_to_or_sext_cond_101
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_minus_1_101_to_or_sext_cond_101
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 101
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT %c(s1)
+    ; CHECK-NEXT: %sel:_(s8) = G_OR [[SEXT]], %one
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 255
+    %one:_(s8) = G_CONSTANT i8 101
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+# select cond, 101, -1 --> or (sext (not Cond)), 101
+name:            select_cond_101_minus_1_to_or_sext_not_cond_101
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 101
+    %one:_(s8) = G_CONSTANT i8 255
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-select.mir
index ac0d561c99a1b..c07e25f716486 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-select.mir
@@ -27,7 +27,7 @@ body:             |
     ; CHECK-LABEL: name: test_combine_select_undef_res0_res1
     ; CHECK: liveins: $x0, $x1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: $x0 = COPY [[COPY]](s64)
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
diff --git a/llvm/test/CodeGen/AArch64/andcompare.ll b/llvm/test/CodeGen/AArch64/andcompare.ll
index 9a7fa04982990..18715a2697339 100644
--- a/llvm/test/CodeGen/AArch64/andcompare.ll
+++ b/llvm/test/CodeGen/AArch64/andcompare.ll
@@ -2414,7 +2414,8 @@ define i32 @cmp_to_ands1(i32 %num) {
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    and w8, w0, #0xff
 ; GISEL-NEXT:    cmp w8, #1
-; GISEL-NEXT:    csel w0, w8, wzr, hi
+; GISEL-NEXT:    cset w9, hi
+; GISEL-NEXT:    and w0, w9, w8
 ; GISEL-NEXT:    ret
   %and = and i32 %num, 255
   %cmp = icmp ugt i32 %and, 1
@@ -2434,7 +2435,8 @@ define i32 @cmp_to_ands2(i32 %num) {
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    and w8, w0, #0xfe
 ; GISEL-NEXT:    cmp w8, #63
-; GISEL-NEXT:    csel w0, w8, wzr, hi
+; GISEL-NEXT:    cset w9, hi
+; GISEL-NEXT:    and w0, w9, w8
 ; GISEL-NEXT:    ret
   %and = and i32 %num, 254
   %cmp = icmp ugt i32 %and, 63
@@ -2451,10 +2453,11 @@ define i32 @cmp_to_ands3(i32 %num, i32 %a) {
 ;
 ; GISEL-LABEL: cmp_to_ands3:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #23
+; GISEL-NEXT:    mov w8, #23 // =0x17
 ; GISEL-NEXT:    and w8, w0, w8
 ; GISEL-NEXT:    cmp w8, #7
-; GISEL-NEXT:    csel w0, w1, wzr, hi
+; GISEL-NEXT:    cset w8, hi
+; GISEL-NEXT:    and w0, w8, w1
 ; GISEL-NEXT:    ret
   %and = and i32 %num, 23
   %cmp = icmp ugt i32 %and, 7
@@ -2514,7 +2517,8 @@ define i32 @cmp_to_ands6(i32 %num) {
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    and w8, w0, #0xfe
 ; GISEL-NEXT:    cmp w8, #16
-; GISEL-NEXT:    csel w0, w8, wzr, hs
+; GISEL-NEXT:    cset w9, hs
+; GISEL-NEXT:    and w0, w9, w8
 ; GISEL-NEXT:    ret
   %and = and i32 %num, 254
   %cmp = icmp uge i32 %and, 16
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index 821f6e403a271..5b6560084095b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -14,7 +14,7 @@ define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB0_2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 5
@@ -42,7 +42,7 @@ define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
 ; SDISEL-NEXT:    bl _foo
 ; SDISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; SDISEL-NEXT:  LBB1_2: ; %if.end
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: single_different:
@@ -55,7 +55,7 @@ define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
 ; GISEL-NEXT:    bl _foo
 ; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; GISEL-NEXT:  LBB1_2: ; %if.end
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 entry:
   %cmp = icmp sle i32 %a, 5
@@ -88,7 +88,7 @@ define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
 ; SDISEL-NEXT:    bl _foo
 ; SDISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; SDISEL-NEXT:  LBB2_3: ; %if.end
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: single_flagclobber:
@@ -106,7 +106,7 @@ define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
 ; GISEL-NEXT:    bl _foo
 ; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; GISEL-NEXT:  LBB2_3: ; %if.end
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 5
@@ -144,7 +144,7 @@ define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB3_3: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 5
@@ -178,13 +178,13 @@ define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
 ; SDISEL-NEXT:    ccmp w8, #16, #0, ge
 ; SDISEL-NEXT:    b.le LBB4_2
 ; SDISEL-NEXT:  ; %bb.1: ; %if.end
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ; SDISEL-NEXT:  LBB4_2: ; %if.then
 ; SDISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; SDISEL-NEXT:    bl _foo
 ; SDISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: speculate_division:
@@ -194,13 +194,13 @@ define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
 ; GISEL-NEXT:    ccmp w8, #17, #0, gt
 ; GISEL-NEXT:    b.lt LBB4_2
 ; GISEL-NEXT:  ; %bb.1: ; %if.end
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  LBB4_2: ; %if.then
 ; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; GISEL-NEXT:    bl _foo
 ; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 entry:
   %cmp = icmp sgt i32 %a, 0
@@ -230,13 +230,13 @@ define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
 ; SDISEL-NEXT:    fccmp s0, s1, #8, ge
 ; SDISEL-NEXT:    b.ge LBB5_2
 ; SDISEL-NEXT:  ; %bb.1: ; %if.end
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ; SDISEL-NEXT:  LBB5_2: ; %if.then
 ; SDISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; SDISEL-NEXT:    bl _foo
 ; SDISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: single_fcmp:
@@ -248,13 +248,13 @@ define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
 ; GISEL-NEXT:    fccmp s0, s1, #8, gt
 ; GISEL-NEXT:    b.ge LBB5_2
 ; GISEL-NEXT:  ; %bb.1: ; %if.end
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  LBB5_2: ; %if.then
 ; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; GISEL-NEXT:    bl _foo
 ; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 entry:
   %cmp = icmp sgt i32 %a, 0
@@ -318,7 +318,7 @@ define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB7_2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 0
@@ -346,13 +346,13 @@ define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    cmp w1, #32
 ; CHECK-NEXT:    b.eq LBB8_3
 ; CHECK-NEXT:  ; %bb.2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  LBB8_3: ; %if.then
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 5
@@ -380,7 +380,7 @@ define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB9_2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 0
@@ -408,7 +408,7 @@ define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB10_2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 0
@@ -466,7 +466,7 @@ define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) {
 ;
 ; GISEL-LABEL: select_and:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #5
+; GISEL-NEXT:    mov w8, #5 ; =0x5
 ; GISEL-NEXT:    cmp w8, w1
 ; GISEL-NEXT:    ccmp w0, w1, #0, ne
 ; GISEL-NEXT:    csel x0, x2, x3, lt
@@ -488,7 +488,7 @@ define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) {
 ;
 ; GISEL-LABEL: select_or:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #5
+; GISEL-NEXT:    mov w8, #5 ; =0x5
 ; GISEL-NEXT:    cmp w8, w1
 ; GISEL-NEXT:    ccmp w0, w1, #8, eq
 ; GISEL-NEXT:    csel x0, x2, x3, lt
@@ -510,7 +510,7 @@ define float @select_or_float(i32 %w0, i32 %w1, float %x2, float %x3) {
 ;
 ; GISEL-LABEL: select_or_float:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #5
+; GISEL-NEXT:    mov w8, #5 ; =0x5
 ; GISEL-NEXT:    cmp w8, w1
 ; GISEL-NEXT:    ccmp w0, w1, #8, eq
 ; GISEL-NEXT:    fcsel s0, s0, s1, lt
@@ -528,17 +528,22 @@ define i64 @gccbug(i64 %x0, i64 %x1) {
 ; SDISEL-NEXT:    cmp x0, #2
 ; SDISEL-NEXT:    ccmp x0, #4, #4, ne
 ; SDISEL-NEXT:    ccmp x1, #0, #0, eq
-; SDISEL-NEXT:    mov w8, #1
+; SDISEL-NEXT:    mov w8, #1 ; =0x1
 ; SDISEL-NEXT:    cinc x0, x8, eq
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: gccbug:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #2
+; GISEL-NEXT:    cmp x1, #0
+; GISEL-NEXT:    cset w8, eq
 ; GISEL-NEXT:    cmp x0, #2
-; GISEL-NEXT:    ccmp x0, #4, #4, ne
-; GISEL-NEXT:    ccmp x1, #0, #0, eq
-; GISEL-NEXT:    csinc x0, x8, xzr, eq
+; GISEL-NEXT:    cset w9, eq
+; GISEL-NEXT:    cmp x0, #4
+; GISEL-NEXT:    cset w10, eq
+; GISEL-NEXT:    orr w9, w10, w9
+; GISEL-NEXT:    and w8, w9, w8
+; GISEL-NEXT:    and x8, x8, #0x1
+; GISEL-NEXT:    add x0, x8, #1
 ; GISEL-NEXT:    ret
   %cmp0 = icmp eq i64 %x1, 0
   %cmp1 = icmp eq i64 %x0, 2
@@ -552,14 +557,30 @@ define i64 @gccbug(i64 %x0, i64 %x1) {
 }
 
 define i32 @select_ororand(i32 %w0, i32 %w1, i32 %w2, i32 %w3) {
-; CHECK-LABEL: select_ororand:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmp w3, #4
-; CHECK-NEXT:    ccmp w2, #2, #0, gt
-; CHECK-NEXT:    ccmp w1, #13, #2, ge
-; CHECK-NEXT:    ccmp w0, #0, #4, ls
-; CHECK-NEXT:    csel w0, w3, wzr, eq
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: select_ororand:
+; SDISEL:       ; %bb.0:
+; SDISEL-NEXT:    cmp w3, #4
+; SDISEL-NEXT:    ccmp w2, #2, #0, gt
+; SDISEL-NEXT:    ccmp w1, #13, #2, ge
+; SDISEL-NEXT:    ccmp w0, #0, #4, ls
+; SDISEL-NEXT:    csel w0, w3, wzr, eq
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: select_ororand:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, #0
+; GISEL-NEXT:    cset w8, eq
+; GISEL-NEXT:    cmp w1, #13
+; GISEL-NEXT:    cset w9, hi
+; GISEL-NEXT:    cmp w2, #2
+; GISEL-NEXT:    cset w10, lt
+; GISEL-NEXT:    cmp w3, #4
+; GISEL-NEXT:    cset w11, gt
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    and w0, w8, w3
+; GISEL-NEXT:    ret
   %c0 = icmp eq i32 %w0, 0
   %c1 = icmp ugt i32 %w1, 13
   %c2 = icmp slt i32 %w2, 2
@@ -592,7 +613,7 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) {
 ; SDISEL-LABEL: select_andor32:
 ; SDISEL:       ; %bb.0:
 ; SDISEL-NEXT:    cmp w1, w2
-; SDISEL-NEXT:    mov w8, #32
+; SDISEL-NEXT:    mov w8, #32 ; =0x20
 ; SDISEL-NEXT:    ccmp w0, w8, #4, lt
 ; SDISEL-NEXT:    ccmp w0, w1, #0, eq
 ; SDISEL-NEXT:    csel w0, w0, w1, eq
@@ -600,7 +621,7 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) {
 ;
 ; GISEL-LABEL: select_andor32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #32
+; GISEL-NEXT:    mov w8, #32 ; =0x20
 ; GISEL-NEXT:    cmp w1, w2
 ; GISEL-NEXT:    ccmp w0, w8, #4, lt
 ; GISEL-NEXT:    ccmp w0, w1, #0, eq
@@ -631,18 +652,18 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
 ; GISEL-LABEL: select_noccmp1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    cmp x0, #0
-; GISEL-NEXT:    cset w8, lt
+; GISEL-NEXT:    cset w8, ge
 ; GISEL-NEXT:    cmp x0, #13
-; GISEL-NEXT:    cset w9, gt
+; GISEL-NEXT:    cset w9, le
 ; GISEL-NEXT:    cmp x2, #2
-; GISEL-NEXT:    cset w10, lt
+; GISEL-NEXT:    cset w10, ge
 ; GISEL-NEXT:    cmp x2, #4
-; GISEL-NEXT:    cset w11, gt
-; GISEL-NEXT:    and w8, w8, w9
-; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    cset w11, le
 ; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    tst w8, #0x1
-; GISEL-NEXT:    csel x0, xzr, x3, ne
+; GISEL-NEXT:    orr w9, w10, w11
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    and x8, x8, #0x1
+; GISEL-NEXT:    and x0, x8, x3
 ; GISEL-NEXT:    ret
   %c0 = icmp slt i64 %v1, 0
   %c1 = icmp sgt i64 %v1, 13
@@ -677,11 +698,12 @@ define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
 ; GISEL-NEXT:    cmp x0, #13
 ; GISEL-NEXT:    cset w9, gt
 ; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    tst w8, #0x1
-; GISEL-NEXT:    csel x0, xzr, x3, ne
+; GISEL-NEXT:    eor w9, w8, #0x1
+; GISEL-NEXT:    and x9, x9, #0x1
 ; GISEL-NEXT:    sbfx w8, w8, #0, #1
-; GISEL-NEXT:    adrp x9, _g@PAGE
-; GISEL-NEXT:    str w8, [x9, _g@PAGEOFF]
+; GISEL-NEXT:    adrp x10, _g@PAGE
+; GISEL-NEXT:    str w8, [x10, _g@PAGEOFF]
+; GISEL-NEXT:    and x0, x9, x3
 ; GISEL-NEXT:    ret
   %c0 = icmp slt i64 %v1, 0
   %c1 = icmp sgt i64 %v1, 13
@@ -701,11 +723,11 @@ define i32 @select_noccmp3(i32 %v0, i32 %v1, i32 %v2) {
 ; SDISEL-NEXT:    ccmp w0, #13, #0, ge
 ; SDISEL-NEXT:    cset w8, gt
 ; SDISEL-NEXT:    cmp w0, #22
-; SDISEL-NEXT:    mov w9, #44
+; SDISEL-NEXT:    mov w9, #44 ; =0x2c
 ; SDISEL-NEXT:    ccmp w0, w9, #0, ge
 ; SDISEL-NEXT:    csel w8, wzr, w8, le
 ; SDISEL-NEXT:    cmp w0, #99
-; SDISEL-NEXT:    mov w9, #77
+; SDISEL-NEXT:    mov w9, #77 ; =0x4d
 ; SDISEL-NEXT:    ccmp w0, w9, #4, ne
 ; SDISEL-NEXT:    cset w9, eq
 ; SDISEL-NEXT:    tst w8, w9
diff --git a/llvm/test/CodeGen/AArch64/call-rv-marker.ll b/llvm/test/CodeGen/AArch64/call-rv-marker.ll
index fc06809ad09fb..de8f5bbfb484d 100644
--- a/llvm/test/CodeGen/AArch64/call-rv-marker.ll
+++ b/llvm/test/CodeGen/AArch64/call-rv-marker.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -o - %s | FileCheck --check-prefix=SELDAG --check-prefix=CHECK %s
 ; RUN: llc -global-isel -o - %s | FileCheck --check-prefix=GISEL --check-prefix=CHECK %s
 
@@ -25,37 +26,93 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
 @fptr = dso_local global ptr null, align 8
 
 define dso_local ptr @rv_marker_1_retain() {
-; CHECK-LABEL: _rv_marker_1_retain:
-; CHECK:         bl _foo1
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_1_retain:
+; SELDAG:       ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 16
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    bl _foo1
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT:    ret
 ;
+; GISEL-LABEL: rv_marker_1_retain:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    bl _foo1
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT:    ret
 entry:
   %call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
   ret ptr %call
 }
 
 define dso_local ptr @rv_marker_1_unsafeClaim() {
-; CHECK-LABEL: _rv_marker_1_unsafeClaim:
-; CHECK:         bl _foo1
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_unsafeClaimAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_1_unsafeClaim:
+; SELDAG:       ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 16
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    bl _foo1
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_unsafeClaimAutoreleasedReturnValue
+; SELDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT:    ret
 ;
+; GISEL-LABEL: rv_marker_1_unsafeClaim:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    bl _foo1
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_unsafeClaimAutoreleasedReturnValue
+; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT:    ret
 entry:
   %call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_unsafeClaimAutoreleasedReturnValue) ]
   ret ptr %call
 }
 
 define dso_local void @rv_marker_2_select(i32 %c) {
-; CHECK-LABEL: _rv_marker_2_select:
-; SELDAG:        cinc  w0, w8, eq
-; GISEL:         csinc w0, w8, wzr, eq
-; CHECK-NEXT:    bl _foo0
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
-; CHECK-NEXT:    ldp x29, x30, [sp], #16
-; CHECK-NEXT:    b _foo2
+; SELDAG-LABEL: rv_marker_2_select:
+; SELDAG:       ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 16
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    mov w8, #1 ; =0x1
+; SELDAG-NEXT:    cmp w0, #0
+; SELDAG-NEXT:    cinc w0, w8, eq
+; SELDAG-NEXT:    bl _foo0
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT:    b _foo2
 ;
+; GISEL-LABEL: rv_marker_2_select:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    mov w8, #1 ; =0x1
+; GISEL-NEXT:    cmp w0, #0
+; GISEL-NEXT:    cinc w0, w8, eq
+; GISEL-NEXT:    bl _foo0
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT:    b _foo2
 entry:
   %tobool.not = icmp eq i32 %c, 0
   %.sink = select i1 %tobool.not, i32 2, i32 1
@@ -65,11 +122,121 @@ entry:
 }
 
 define dso_local void @rv_marker_3() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: _rv_marker_3:
-; CHECK:         bl _foo1
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_3:
+; SELDAG:       Lfunc_begin0:
+; SELDAG-NEXT:    .cfi_startproc
+; SELDAG-NEXT:    .cfi_personality 155, ___gxx_personality_v0
+; SELDAG-NEXT:    .cfi_lsda 16, Lexception0
+; SELDAG-NEXT:  ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 32
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    .cfi_offset w19, -24
+; SELDAG-NEXT:    .cfi_offset w20, -32
+; SELDAG-NEXT:    bl _foo1
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    mov x19, x0
+; SELDAG-NEXT:  Ltmp0:
+; SELDAG-NEXT:    bl _objc_object
+; SELDAG-NEXT:  Ltmp1:
+; SELDAG-NEXT:  ; %bb.1: ; %invoke.cont
+; SELDAG-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; SELDAG-NEXT:    b _objc_release
+; SELDAG-NEXT:  LBB3_2: ; %lpad
+; SELDAG-NEXT:  Ltmp2:
+; SELDAG-NEXT:    mov x20, x0
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    bl _objc_release
+; SELDAG-NEXT:    mov x0, x20
+; SELDAG-NEXT:    bl __Unwind_Resume
+; SELDAG-NEXT:  Lfunc_end0:
+; SELDAG-NEXT:    .cfi_endproc
+; SELDAG-NEXT:    .section __TEXT,__gcc_except_tab
+; SELDAG-NEXT:    .p2align 2, 0x0
+; SELDAG-NEXT:  GCC_except_table3:
+; SELDAG-NEXT:  Lexception0:
+; SELDAG-NEXT:    .byte 255 ; @LPStart Encoding = omit
+; SELDAG-NEXT:    .byte 255 ; @TType Encoding = omit
+; SELDAG-NEXT:    .byte 1 ; Call site Encoding = uleb128
+; SELDAG-NEXT:    .uleb128 Lcst_end0-Lcst_begin0
+; SELDAG-NEXT:  Lcst_begin0:
+; SELDAG-NEXT:    .uleb128 Lfunc_begin0-Lfunc_begin0 ; >> Call Site 1 <<
+; SELDAG-NEXT:    .uleb128 Ltmp0-Lfunc_begin0 ; Call between Lfunc_begin0 and Ltmp0
+; SELDAG-NEXT:    .byte 0 ; has no landing pad
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:    .uleb128 Ltmp0-Lfunc_begin0 ; >> Call Site 2 <<
+; SELDAG-NEXT:    .uleb128 Ltmp1-Ltmp0 ; Call between Ltmp0 and Ltmp1
+; SELDAG-NEXT:    .uleb128 Ltmp2-Lfunc_begin0 ; jumps to Ltmp2
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:    .uleb128 Ltmp1-Lfunc_begin0 ; >> Call Site 3 <<
+; SELDAG-NEXT:    .uleb128 Lfunc_end0-Ltmp1 ; Call between Ltmp1 and Lfunc_end0
+; SELDAG-NEXT:    .byte 0 ; has no landing pad
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:  Lcst_end0:
+; SELDAG-NEXT:    .p2align 2, 0x0
 ;
+; GISEL-LABEL: rv_marker_3:
+; GISEL:       Lfunc_begin0:
+; GISEL-NEXT:    .cfi_startproc
+; GISEL-NEXT:    .cfi_personality 155, ___gxx_personality_v0
+; GISEL-NEXT:    .cfi_lsda 16, Lexception0
+; GISEL-NEXT:  ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 32
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    .cfi_offset w19, -24
+; GISEL-NEXT:    .cfi_offset w20, -32
+; GISEL-NEXT:    bl _foo1
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    mov x19, x0
+; GISEL-NEXT:  Ltmp0:
+; GISEL-NEXT:    bl _objc_object
+; GISEL-NEXT:  Ltmp1:
+; GISEL-NEXT:  ; %bb.1: ; %invoke.cont
+; GISEL-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; GISEL-NEXT:    b _objc_release
+; GISEL-NEXT:  LBB3_2: ; %lpad
+; GISEL-NEXT:  Ltmp2:
+; GISEL-NEXT:    mov x20, x0
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    bl _objc_release
+; GISEL-NEXT:    mov x0, x20
+; GISEL-NEXT:    bl __Unwind_Resume
+; GISEL-NEXT:  Lfunc_end0:
+; GISEL-NEXT:    .cfi_endproc
+; GISEL-NEXT:    .section __TEXT,__gcc_except_tab
+; GISEL-NEXT:    .p2align 2, 0x0
+; GISEL-NEXT:  GCC_except_table3:
+; GISEL-NEXT:  Lexception0:
+; GISEL-NEXT:    .byte 255 ; @LPStart Encoding = omit
+; GISEL-NEXT:    .byte 255 ; @TType Encoding = omit
+; GISEL-NEXT:    .byte 1 ; Call site Encoding = uleb128
+; GISEL-NEXT:    .uleb128 Lcst_end0-Lcst_begin0
+; GISEL-NEXT:  Lcst_begin0:
+; GISEL-NEXT:    .uleb128 Lfunc_begin0-Lfunc_begin0 ; >> Call Site 1 <<
+; GISEL-NEXT:    .uleb128 Ltmp0-Lfunc_begin0 ; Call between Lfunc_begin0 and Ltmp0
+; GISEL-NEXT:    .byte 0 ; has no landing pad
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:    .uleb128 Ltmp0-Lfunc_begin0 ; >> Call Site 2 <<
+; GISEL-NEXT:    .uleb128 Ltmp1-Ltmp0 ; Call between Ltmp0 and Ltmp1
+; GISEL-NEXT:    .uleb128 Ltmp2-Lfunc_begin0 ; jumps to Ltmp2
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:    .uleb128 Ltmp1-Lfunc_begin0 ; >> Call Site 3 <<
+; GISEL-NEXT:    .uleb128 Lfunc_end0-Ltmp1 ; Call between Ltmp1 and Lfunc_end0
+; GISEL-NEXT:    .byte 0 ; has no landing pad
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:  Lcst_end0:
+; GISEL-NEXT:    .p2align 2, 0x0
 entry:
   %call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
   invoke void @objc_object(ptr %call) #5
@@ -87,13 +254,151 @@ lpad:                                             ; preds = %entry
 }
 
 define dso_local void @rv_marker_4() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: _rv_marker_4:
-; CHECK:       Ltmp3:
-; CHECK-NEXT:    bl _foo1
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
-; CHECK-NEXT:  Ltmp4:
+; SELDAG-LABEL: rv_marker_4:
+; SELDAG:       Lfunc_begin1:
+; SELDAG-NEXT:    .cfi_startproc
+; SELDAG-NEXT:    .cfi_personality 155, ___gxx_personality_v0
+; SELDAG-NEXT:    .cfi_lsda 16, Lexception1
+; SELDAG-NEXT:  ; %bb.0: ; %entry
+; SELDAG-NEXT:    sub sp, sp, #48
+; SELDAG-NEXT:    stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; SELDAG-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 48
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    .cfi_offset w19, -24
+; SELDAG-NEXT:    .cfi_offset w20, -32
+; SELDAG-NEXT:  Ltmp3:
+; SELDAG-NEXT:    bl _foo1
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:  Ltmp4:
+; SELDAG-NEXT:  ; %bb.1: ; %invoke.cont
+; SELDAG-NEXT:  Ltmp6:
+; SELDAG-NEXT:    mov x19, x0
+; SELDAG-NEXT:    bl _objc_object
+; SELDAG-NEXT:  Ltmp7:
+; SELDAG-NEXT:  ; %bb.2: ; %invoke.cont2
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    bl _objc_release
+; SELDAG-NEXT:    add x0, sp, #15
+; SELDAG-NEXT:    bl __ZN1SD1Ev
+; SELDAG-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; SELDAG-NEXT:    ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; SELDAG-NEXT:    add sp, sp, #48
+; SELDAG-NEXT:    ret
+; SELDAG-NEXT:  LBB4_3: ; %lpad1
+; SELDAG-NEXT:  Ltmp8:
+; SELDAG-NEXT:    mov x20, x0
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    bl _objc_release
+; SELDAG-NEXT:    b LBB4_5
+; SELDAG-NEXT:  LBB4_4: ; %lpad
+; SELDAG-NEXT:  Ltmp5:
+; SELDAG-NEXT:    mov x20, x0
+; SELDAG-NEXT:  LBB4_5: ; %ehcleanup
+; SELDAG-NEXT:    add x0, sp, #15
+; SELDAG-NEXT:    bl __ZN1SD1Ev
+; SELDAG-NEXT:    mov x0, x20
+; SELDAG-NEXT:    bl __Unwind_Resume
+; SELDAG-NEXT:  Lfunc_end1:
+; SELDAG-NEXT:    .cfi_endproc
+; SELDAG-NEXT:    .section __TEXT,__gcc_except_tab
+; SELDAG-NEXT:    .p2align 2, 0x0
+; SELDAG-NEXT:  GCC_except_table4:
+; SELDAG-NEXT:  Lexception1:
+; SELDAG-NEXT:    .byte 255 ; @LPStart Encoding = omit
+; SELDAG-NEXT:    .byte 255 ; @TType Encoding = omit
+; SELDAG-NEXT:    .byte 1 ; Call site Encoding = uleb128
+; SELDAG-NEXT:    .uleb128 Lcst_end1-Lcst_begin1
+; SELDAG-NEXT:  Lcst_begin1:
+; SELDAG-NEXT:    .uleb128 Ltmp3-Lfunc_begin1 ; >> Call Site 1 <<
+; SELDAG-NEXT:    .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4
+; SELDAG-NEXT:    .uleb128 Ltmp5-Lfunc_begin1 ; jumps to Ltmp5
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:    .uleb128 Ltmp6-Lfunc_begin1 ; >> Call Site 2 <<
+; SELDAG-NEXT:    .uleb128 Ltmp7-Ltmp6 ; Call between Ltmp6 and Ltmp7
+; SELDAG-NEXT:    .uleb128 Ltmp8-Lfunc_begin1 ; jumps to Ltmp8
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:    .uleb128 Ltmp7-Lfunc_begin1 ; >> Call Site 3 <<
+; SELDAG-NEXT:    .uleb128 Lfunc_end1-Ltmp7 ; Call between Ltmp7 and Lfunc_end1
+; SELDAG-NEXT:    .byte 0 ; has no landing pad
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:  Lcst_end1:
+; SELDAG-NEXT:    .p2align 2, 0x0
 ;
+; GISEL-LABEL: rv_marker_4:
+; GISEL:       Lfunc_begin1:
+; GISEL-NEXT:    .cfi_startproc
+; GISEL-NEXT:    .cfi_personality 155, ___gxx_personality_v0
+; GISEL-NEXT:    .cfi_lsda 16, Lexception1
+; GISEL-NEXT:  ; %bb.0: ; %entry
+; GISEL-NEXT:    sub sp, sp, #48
+; GISEL-NEXT:    stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 48
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    .cfi_offset w19, -24
+; GISEL-NEXT:    .cfi_offset w20, -32
+; GISEL-NEXT:  Ltmp3:
+; GISEL-NEXT:    bl _foo1
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:  Ltmp4:
+; GISEL-NEXT:  ; %bb.1: ; %invoke.cont
+; GISEL-NEXT:  Ltmp6:
+; GISEL-NEXT:    mov x19, x0
+; GISEL-NEXT:    bl _objc_object
+; GISEL-NEXT:  Ltmp7:
+; GISEL-NEXT:  ; %bb.2: ; %invoke.cont2
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    bl _objc_release
+; GISEL-NEXT:    add x0, sp, #15
+; GISEL-NEXT:    bl __ZN1SD1Ev
+; GISEL-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT:    add sp, sp, #48
+; GISEL-NEXT:    ret
+; GISEL-NEXT:  LBB4_3: ; %lpad1
+; GISEL-NEXT:  Ltmp8:
+; GISEL-NEXT:    mov x20, x0
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    bl _objc_release
+; GISEL-NEXT:    b LBB4_5
+; GISEL-NEXT:  LBB4_4: ; %lpad
+; GISEL-NEXT:  Ltmp5:
+; GISEL-NEXT:    mov x20, x0
+; GISEL-NEXT:  LBB4_5: ; %ehcleanup
+; GISEL-NEXT:    add x0, sp, #15
+; GISEL-NEXT:    bl __ZN1SD1Ev
+; GISEL-NEXT:    mov x0, x20
+; GISEL-NEXT:    bl __Unwind_Resume
+; GISEL-NEXT:  Lfunc_end1:
+; GISEL-NEXT:    .cfi_endproc
+; GISEL-NEXT:    .section __TEXT,__gcc_except_tab
+; GISEL-NEXT:    .p2align 2, 0x0
+; GISEL-NEXT:  GCC_except_table4:
+; GISEL-NEXT:  Lexception1:
+; GISEL-NEXT:    .byte 255 ; @LPStart Encoding = omit
+; GISEL-NEXT:    .byte 255 ; @TType Encoding = omit
+; GISEL-NEXT:    .byte 1 ; Call site Encoding = uleb128
+; GISEL-NEXT:    .uleb128 Lcst_end1-Lcst_begin1
+; GISEL-NEXT:  Lcst_begin1:
+; GISEL-NEXT:    .uleb128 Ltmp3-Lfunc_begin1 ; >> Call Site 1 <<
+; GISEL-NEXT:    .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4
+; GISEL-NEXT:    .uleb128 Ltmp5-Lfunc_begin1 ; jumps to Ltmp5
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:    .uleb128 Ltmp6-Lfunc_begin1 ; >> Call Site 2 <<
+; GISEL-NEXT:    .uleb128 Ltmp7-Ltmp6 ; Call between Ltmp6 and Ltmp7
+; GISEL-NEXT:    .uleb128 Ltmp8-Lfunc_begin1 ; jumps to Ltmp8
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:    .uleb128 Ltmp7-Lfunc_begin1 ; >> Call Site 3 <<
+; GISEL-NEXT:    .uleb128 Lfunc_end1-Ltmp7 ; Call between Ltmp7 and Lfunc_end1
+; GISEL-NEXT:    .byte 0 ; has no landing pad
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:  Lcst_end1:
+; GISEL-NEXT:    .p2align 2, 0x0
 entry:
   %s = alloca %struct.S, align 1
   call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %s) #2
@@ -129,11 +434,53 @@ ehcleanup:                                        ; preds = %lpad1, %lpad
 }
 
 define dso_local ptr @rv_marker_5_indirect_call() {
-; CHECK-LABEL: _rv_marker_5_indirect_call:
-; CHECK:         ldr [[ADDR:x[0-9]+]], [
-; CHECK-NEXT:    blr [[ADDR]]
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_5_indirect_call:
+; SELDAG:       ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 32
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    .cfi_offset w19, -24
+; SELDAG-NEXT:    .cfi_offset w20, -32
+; SELDAG-NEXT:  Lloh0:
+; SELDAG-NEXT:    adrp x8, _fptr@PAGE
+; SELDAG-NEXT:  Lloh1:
+; SELDAG-NEXT:    ldr x8, [x8, _fptr@PAGEOFF]
+; SELDAG-NEXT:    blr x8
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    mov x19, x0
+; SELDAG-NEXT:    bl _foo2
+; SELDAG-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; SELDAG-NEXT:    ret
+; SELDAG-NEXT:    .loh AdrpLdr Lloh0, Lloh1
+;
+; GISEL-LABEL: rv_marker_5_indirect_call:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 32
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    .cfi_offset w19, -24
+; GISEL-NEXT:    .cfi_offset w20, -32
+; GISEL-NEXT:  Lloh0:
+; GISEL-NEXT:    adrp x8, _fptr@PAGE
+; GISEL-NEXT:  Lloh1:
+; GISEL-NEXT:    ldr x8, [x8, _fptr@PAGEOFF]
+; GISEL-NEXT:    blr x8
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    mov x19, x0
+; GISEL-NEXT:    bl _foo2
+; GISEL-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; GISEL-NEXT:    ret
+; GISEL-NEXT:    .loh AdrpLdr Lloh0, Lloh1
 entry:
   %0 = load ptr, ptr @fptr, align 8
   %call = call ptr %0() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
@@ -144,13 +491,35 @@ entry:
 declare ptr @foo(i64, i64, i64)
 
 define dso_local void @rv_marker_multiarg(i64 %a, i64 %b, i64 %c) {
-; CHECK-LABEL: _rv_marker_multiarg:
-; CHECK:         mov [[TMP:x[0-9]+]], x0
-; CHECK-NEXT:    mov x0, x2
-; CHECK-NEXT:    mov x2, [[TMP]]
-; CHECK-NEXT:    bl  _foo
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_multiarg:
+; SELDAG:       ; %bb.0:
+; SELDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 16
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    mov x8, x0
+; SELDAG-NEXT:    mov x0, x2
+; SELDAG-NEXT:    mov x2, x8
+; SELDAG-NEXT:    bl _foo
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT:    ret
+;
+; GISEL-LABEL: rv_marker_multiarg:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    mov x3, x0
+; GISEL-NEXT:    mov x0, x2
+; GISEL-NEXT:    mov x2, x3
+; GISEL-NEXT:    bl _foo
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT:    ret
   call ptr @foo(i64 %c, i64 %b, i64 %a) [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
   ret void
 }
@@ -158,3 +527,5 @@ define dso_local void @rv_marker_multiarg(i64 %a, i64 %b, i64 %c) {
 declare ptr @objc_retainAutoreleasedReturnValue(ptr)
 declare ptr @objc_unsafeClaimAutoreleasedReturnValue(ptr)
 declare i32 @__gxx_personality_v0(...)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index f17b9724aadba..0d02cc170a98b 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1100,9 +1100,7 @@ define <8 x i8> @vselect_constant_cond_zero_v8i8(<8 x i8> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI83_0
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI83_0]
-; CHECK-GI-NEXT:    shl v1.8b, v1.8b, #7
-; CHECK-GI-NEXT:    sshr v1.8b, v1.8b, #7
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    ret
   %b = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i8> %a, <8 x i8> zeroinitializer
   ret <8 x i8> %b
@@ -1119,9 +1117,7 @@ define <4 x i16> @vselect_constant_cond_zero_v4i16(<4 x i16> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI84_0
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI84_0]
-; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #15
-; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #15
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    ret
   %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> %a, <4 x i16> zeroinitializer
   ret <4 x i16> %b
@@ -1139,12 +1135,12 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI85_1
 ; CHECK-GI-NEXT:    adrp x9, .LCPI85_0
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI85_1]
-; CHECK-GI-NEXT:    ldr d2, [x9, :lo12:.LCPI85_0]
-; CHECK-GI-NEXT:    mov v1.d[1], v2.d[0]
-; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #31
-; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #31
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI85_1]
+; CHECK-GI-NEXT:    ldr d3, [x9, :lo12:.LCPI85_0]
+; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    ret
   %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> %a, <4 x i32> zeroinitializer
   ret <4 x i32> %b
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
index 96f2f63d703c7..7315226ab6265 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
@@ -1,5 +1,5 @@
-; RUN: llc --stop-after=finalize-isel -o - | FileCheck %s
-target triple = "aarch64-linux"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc  -mtriple aarch64-unknown --stop-after=finalize-isel -o - | FileCheck %s
 
 ; Check dynamic stack allocation and probing instructions do not have
 ; the FrameSetup flag.
@@ -12,3 +12,5 @@ define void @no_frame_setup(i64 %size, ptr %out) #0 {
 }
 
 attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
index 9f3ad8b444446..d60dbe3c8436e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
@@ -450,8 +450,9 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %and:_(s32) = G_SELECT %cond(s1), %zero, %variable
+    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(ne), %reg(s32), %zero
+    ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -476,7 +477,8 @@ body: |
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %and:_(s32) = G_SELECT %cond(s1), %variable, %zero
+    ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -500,9 +502,9 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
-    ; CHECK-NEXT: %or:_(s32) = G_SELECT %cond(s1), %variable, %neg1
+    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(ne), %reg(s32), %zero
+    ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -527,8 +529,8 @@ body: |
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
-    ; CHECK-NEXT: %or:_(s32) = G_SELECT %cond(s1), %neg1, %variable
+    ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -549,15 +551,8 @@ body: |
     ; CHECK-LABEL: name: fold_and_variable_into_select_undef_neg1_s32
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
-    ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
-    ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %undef, %neg1
-    ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
-    ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit %variable(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
     %zero:_(s32) = G_CONSTANT i32 0
@@ -582,9 +577,8 @@ body: |
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %undef, %zero
-    ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT %cond(s1)
+    ; CHECK-NEXT: %and:_(s32) = G_AND [[ZEXT]], %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -667,9 +661,9 @@ body: |
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
     ; CHECK-NEXT: %otherconst:_(s32) = G_CONSTANT i32 123
-    ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %neg1, %otherconst
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %select:_(s32) = G_OR [[SEXT]], %otherconst
     ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32)
     %reg:_(s32) = COPY $vgpr0
@@ -749,8 +743,7 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: %srem:_(s32) = G_SELECT %cond(s1), [[C]], %zero
+    ; CHECK-NEXT: %srem:_(s32) = G_ZEXT %cond(s1)
     ; CHECK-NEXT: S_ENDPGM 0, implicit %srem(s32)
     %reg:_(s32) = COPY $vgpr0
     %zero:_(s32) = G_CONSTANT i32 0
@@ -802,8 +795,7 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: %udiv:_(s32) = G_SELECT %cond(s1), [[C]], %zero
+    ; CHECK-NEXT: %udiv:_(s32) = G_ZEXT %cond(s1)
     ; CHECK-NEXT: S_ENDPGM 0, implicit %udiv(s32)
     %reg:_(s32) = COPY $vgpr0
     %zero:_(s32) = G_CONSTANT i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index f9b98059be0b3..d23f33c90c1d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -5758,42 +5758,43 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX6-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[14:15], s[0:1], s10
-; GFX6-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[0:1], s10
+; GFX6-NEXT:    s_lshr_b64 s[18:19], s[0:1], s11
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
-; GFX6-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX6-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
+; GFX6-NEXT:    s_bfe_u64 s[18:19], s[14:15], 0x10000
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX6-NEXT:    s_cselect_b64 s[14:15], s[14:15], 0
+; GFX6-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    s_mov_b32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX6-NEXT:    s_lshl_b32 s13, s6, 31
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX6-NEXT:    s_sub_i32 s12, s8, 64
-; GFX6-NEXT:    s_sub_i32 s10, 64, s8
+; GFX6-NEXT:    s_sub_i32 s7, s8, 64
+; GFX6-NEXT:    s_sub_i32 s12, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
+; GFX6-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s8
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
-; GFX6-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[4:5], s12
+; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
-; GFX6-NEXT:    s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; GFX6-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -5804,42 +5805,43 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX8-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[14:15], s[0:1], s10
-; GFX8-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[0:1], s10
+; GFX8-NEXT:    s_lshr_b64 s[18:19], s[0:1], s11
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
-; GFX8-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX8-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
+; GFX8-NEXT:    s_bfe_u64 s[18:19], s[14:15], 0x10000
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX8-NEXT:    s_cselect_b64 s[14:15], s[14:15], 0
+; GFX8-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
-; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_mov_b32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX8-NEXT:    s_lshl_b32 s13, s6, 31
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX8-NEXT:    s_sub_i32 s12, s8, 64
-; GFX8-NEXT:    s_sub_i32 s10, 64, s8
+; GFX8-NEXT:    s_sub_i32 s7, s8, 64
+; GFX8-NEXT:    s_sub_i32 s12, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
+; GFX8-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s8
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
-; GFX8-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[4:5], s12
+; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
-; GFX8-NEXT:    s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
+; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; GFX8-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -5850,42 +5852,43 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX9-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[14:15], s[0:1], s10
-; GFX9-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[0:1], s10
+; GFX9-NEXT:    s_lshr_b64 s[18:19], s[0:1], s11
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
-; GFX9-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX9-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
+; GFX9-NEXT:    s_bfe_u64 s[18:19], s[14:15], 0x10000
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX9-NEXT:    s_cselect_b64 s[14:15], s[14:15], 0
+; GFX9-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
-; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_mov_b32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX9-NEXT:    s_lshl_b32 s13, s6, 31
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX9-NEXT:    s_sub_i32 s12, s8, 64
-; GFX9-NEXT:    s_sub_i32 s10, 64, s8
+; GFX9-NEXT:    s_sub_i32 s7, s8, 64
+; GFX9-NEXT:    s_sub_i32 s12, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s8
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
-; GFX9-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[4:5], s12
+; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
-; GFX9-NEXT:    s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; GFX9-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -5896,42 +5899,43 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX10-NEXT:    s_mov_b32 s12, 0
-; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_mov_b32 s14, 0
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[14:15], s[0:1], s11
-; GFX10-NEXT:    s_lshl_b64 s[16:17], s[2:3], s10
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX10-NEXT:    s_lshl_b64 s[18:19], s[2:3], s10
 ; GFX10-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
-; GFX10-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_bfe_u64 s[18:19], s[12:13], 0x10000
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX10-NEXT:    s_and_b64 s[10:11], s[18:19], s[10:11]
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
-; GFX10-NEXT:    s_lshl_b32 s13, s6, 31
+; GFX10-NEXT:    s_lshl_b32 s15, s6, 31
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX10-NEXT:    s_sub_i32 s14, s8, 64
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[14:15]
+; GFX10-NEXT:    s_sub_i32 s7, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
-; GFX10-NEXT:    s_lshl_b64 s[12:13], s[4:5], s9
+; GFX10-NEXT:    s_lshr_b64 s[12:13], s[0:1], s8
+; GFX10-NEXT:    s_lshl_b64 s[14:15], s[4:5], s9
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
-; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[12:13]
-; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
 ; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
+; GFX10-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -5942,42 +5946,44 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX11-NEXT:    s_mov_b32 s12, 0
-; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX11-NEXT:    s_mov_b32 s14, 0
+; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[14:15], s[0:1], s11
-; GFX11-NEXT:    s_lshl_b64 s[16:17], s[2:3], s10
+; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX11-NEXT:    s_lshl_b64 s[18:19], s[2:3], s10
 ; GFX11-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
-; GFX11-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX11-NEXT:    s_bfe_u64 s[18:19], s[12:13], 0x10000
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX11-NEXT:    s_and_b64 s[10:11], s[18:19], s[10:11]
+; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX11-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
-; GFX11-NEXT:    s_lshl_b32 s13, s6, 31
+; GFX11-NEXT:    s_lshl_b32 s15, s6, 31
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX11-NEXT:    s_sub_i32 s14, s8, 64
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[14:15]
+; GFX11-NEXT:    s_sub_i32 s7, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
-; GFX11-NEXT:    s_lshl_b64 s[12:13], s[4:5], s9
+; GFX11-NEXT:    s_lshr_b64 s[12:13], s[0:1], s8
+; GFX11-NEXT:    s_lshl_b64 s[14:15], s[4:5], s9
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
-; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[12:13]
-; GFX11-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX11-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX11-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX11-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
 ; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
+; GFX11-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6000,37 +6006,35 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GFX6-NEXT:    v_cndmask_b32_e32 v12, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v13, v1, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], 1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 31, v6
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[6:7], 1
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v15
-; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, 64, v15
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], v15
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], v6
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], v15
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v14
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 31, v6
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[6:7], 1
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v15
+; GFX6-NEXT:    v_subrev_i32_e32 v13, vcc, 64, v15
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[0:1], v15
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], v8
+; GFX6-NEXT:    v_and_b32_e32 v12, v10, v12
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[4:5], v15
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v13
+; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v10, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v11, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, v12, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX6-NEXT:    v_or_b32_e32 v7, v7, v9
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v10
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX6-NEXT:    v_or_b32_e32 v0, v12, v0
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i128:
@@ -6048,37 +6052,35 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v1, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 31, v6
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[6:7]
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v15
-; GFX8-NEXT:    v_subrev_u32_e32 v14, vcc, 64, v15
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v15, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v6, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v15, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v14, v[2:3]
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 31, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[6:7]
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v15
+; GFX8-NEXT:    v_subrev_u32_e32 v13, vcc, 64, v15
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v15, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v8, v[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v12, v10, v12
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v15, v[4:5]
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v13, v[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v10, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v11, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v12, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX8-NEXT:    v_or_b32_e32 v7, v7, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i128:
@@ -6096,22 +6098,21 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v1, v9, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v0, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v8, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v8, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[6:7]
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 31, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v15
-; GFX9-NEXT:    v_subrev_u32_e32 v14, 64, v15
+; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v15
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v15, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v15, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v14, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v16, v[2:3]
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
@@ -6120,12 +6121,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v10, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v11, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v12, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_or_b32 v0, v10, v12, v0
+; GFX9-NEXT:    v_and_or_b32 v1, 0, v13, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v8, v11
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v9, v14
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i128:
@@ -6151,79 +6151,79 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[12:13]
 ; GFX10-NEXT:    v_or_b32_e32 v11, v11, v9
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v8, v[12:13]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v14, v14, v16
 ; GFX10-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[12:13]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v6, v4
-; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v8
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v8, v14, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v9, v15, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v19, v[12:13]
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, v2, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v11, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s5
+; GFX10-NEXT:    v_and_or_b32 v0, v20, v6, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0, v7, v4
+; GFX10-NEXT:    v_and_or_b32 v2, v5, v8, v2
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v9, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0x7f, v8
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX11-NEXT:    v_not_b32_e32 v8, v8
 ; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 1, v[6:7]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
 ; GFX11-NEXT:    v_lshl_or_b32 v5, v6, 31, v5
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v18, v[0:1]
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
-; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v8
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
 ; GFX11-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v8
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v16, v[12:13]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v9
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v19
 ; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v8, v[12:13]
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v14, v14, v16
 ; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v19, v[12:13]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v18
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v6, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v0, v2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, v7, v5
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v9
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v8, v14, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v9, v15, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v19, v[12:13]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v11, v5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v9, v3
+; GFX11-NEXT:    v_and_or_b32 v0, v20, v6, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_and_or_b32 v1, 0, v7, v4
+; GFX11-NEXT:    v_and_or_b32 v2, v5, v8, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
   ret i128 %result
@@ -6234,19 +6234,18 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX6-NEXT:    v_not_b32_e32 v0, v0
-; GFX6-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v6
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v0
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], s[2:3], v6
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v6
-; GFX6-NEXT:    v_lshl_b64 v[4:5], s[0:1], v6
+; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 64, v6
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshl_b64 v[0:1], s[0:1], v8
+; GFX6-NEXT:    v_lshl_b64 v[0:1], s[0:1], v7
+; GFX6-NEXT:    v_lshl_b64 v[4:5], s[0:1], v6
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX6-NEXT:    s_mov_b32 s8, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
@@ -6254,51 +6253,49 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX6-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX6-NEXT:    v_and_b32_e32 v9, v5, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v7
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v7
-; GFX6-NEXT:    v_lshl_b64 v[2:3], s[2:3], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v7
-; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[2:3], v11
-; GFX6-NEXT:    v_lshr_b64 v[4:5], s[2:3], v7
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v8
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v8
+; GFX6-NEXT:    v_lshl_b64 v[4:5], s[2:3], v4
+; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, 64, v8
+; GFX6-NEXT:    v_or_b32_e32 v4, v0, v4
+; GFX6-NEXT:    v_or_b32_e32 v5, v1, v5
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[2:3], v10
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX6-NEXT:    v_lshr_b64 v[6:7], s[2:3], v8
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v4, s0
+; GFX6-NEXT:    v_mov_b32_e32 v5, s1
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX6-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshl_i128_ssv:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v6
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v6, s[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v6
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
+; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 64, v6
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, s[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v7, s[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX8-NEXT:    s_mov_b32 s8, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
@@ -6306,51 +6303,49 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX8-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX8-NEXT:    v_and_b32_e32 v9, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v7
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v7, s[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 64, v7
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v7, s[2:3]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v8, s[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v4, s[2:3]
+; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, 64, v8
+; GFX8-NEXT:    v_or_b32_e32 v4, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v1, v5
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v10, s[2:3]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v8, s[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshl_i128_ssv:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v6
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v6, s[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v6
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
+; GFX9-NEXT:    v_subrev_u32_e32 v7, 64, v6
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, s[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, s[0:1]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -6358,137 +6353,130 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX9-NEXT:    s_lshl_b32 s9, s6, 31
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v0, v2, vcc
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX9-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v7, s[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v8, s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v11, 64, v7
+; GFX9-NEXT:    v_subrev_u32_e32 v12, 64, v8
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v7, s[2:3]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v12, s[2:3]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v8, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_or_b32 v0, v9, v4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, 0, v5, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v6, v10
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v7, v11
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshl_i128_ssv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_and_b32_e32 v12, 0x7f, v0
+; GFX10-NEXT:    v_and_b32_e32 v10, 0x7f, v0
 ; GFX10-NEXT:    v_not_b32_e32 v0, v0
 ; GFX10-NEXT:    s_mov_b32 s8, 0
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
 ; GFX10-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT:    v_and_b32_e32 v13, 0x7f, v0
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
-; GFX10-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v10
+; GFX10-NEXT:    v_and_b32_e32 v11, 0x7f, v0
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v10, s[2:3]
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
-; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, s[8:9]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v12, s[0:1]
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, 64, v13
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v11
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v12, 64, v10
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v11, s[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v10, s[0:1]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v10
+; GFX10-NEXT:    v_or_b32_e32 v13, v2, v0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 64, v11
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v1
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[6:7]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
+; GFX10-NEXT:    v_or_b32_e32 v14, v3, v1
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[0:1]
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v11
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX10-NEXT:    v_or_b32_e32 v7, v7, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s8, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, s2, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, s3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s9, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX10-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX10-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v2, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v3, v7, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v10
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v11, s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v0, s2, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v1, s3, s1
+; GFX10-NEXT:    v_and_or_b32 v0, v12, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0, v5, v6
+; GFX10-NEXT:    v_and_or_b32 v2, v7, v2, v8
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v3, v9
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_fshl_i128_ssv:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e32 v12, 0x7f, v0
+; GFX11-NEXT:    v_and_b32_e32 v10, 0x7f, v0
 ; GFX11-NEXT:    v_not_b32_e32 v0, v0
 ; GFX11-NEXT:    s_mov_b32 s8, 0
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
 ; GFX11-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, s[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX11-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
-; GFX11-NEXT:    v_and_b32_e32 v13, 0x7f, v0
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v10
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0x7f, v0
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, s[2:3]
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v12, 64, v10
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v13, s[8:9]
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v1
+; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v13, v2, v0
+; GFX11-NEXT:    v_or_b32_e32 v14, v3, v1
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc_lo
+; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 64, v11
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 64, v11
+; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v11, s[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v10, s[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[6:7]
+; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v11
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v11
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v10
 ; GFX11-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v0, 64, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v7, v7, v9
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v0, s[6:7]
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v8, s2, s4
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v10, s3, s4
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s8, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s9, s1
-; GFX11-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v0, s2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v1, s3, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v2, v6, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v3, v7, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v11, s[6:7]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, s4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v7, s5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v3, v9
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX11-NEXT:    v_and_or_b32 v0, v12, v4, v0
+; GFX11-NEXT:    v_and_or_b32 v1, 0, v5, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX11-NEXT:    v_and_or_b32 v2, v7, v2, v8
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
   %cast.result = bitcast i128 %result to <4 x float>
@@ -6503,49 +6491,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX6-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[0:1], s6
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], s6
+; GFX6-NEXT:    s_lshr_b64 s[12:13], s[0:1], s7
 ; GFX6-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
-; GFX6-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX6-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX6-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 31, v2
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    s_sub_i32 s0, s4, 64
-; GFX6-NEXT:    s_sub_i32 s1, 64, s4
+; GFX6-NEXT:    s_sub_i32 s3, s4, 64
+; GFX6-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s4
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s1
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s5
 ; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s4
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s5
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s3
+; GFX6-NEXT:    s_and_b32 s3, 1, s2
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s6
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX6-NEXT:    s_and_b32 s3, 1, s6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, s9, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX6-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX6-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s11, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshl_i128_svs:
@@ -6555,49 +6545,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX8-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[0:1], s6
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], s6
+; GFX8-NEXT:    s_lshr_b64 s[12:13], s[0:1], s7
 ; GFX8-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
-; GFX8-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX8-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX8-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 31, v2
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    s_sub_i32 s0, s4, 64
-; GFX8-NEXT:    s_sub_i32 s1, 64, s4
+; GFX8-NEXT:    s_sub_i32 s3, s4, 64
+; GFX8-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s5, v[2:3]
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX8-NEXT:    s_and_b32 s0, 1, s5
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s3, v[2:3]
+; GFX8-NEXT:    s_and_b32 s3, 1, s2
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_and_b32 s0, 1, s6
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX8-NEXT:    s_and_b32 s3, 1, s6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, s8, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, s9, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX8-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX8-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, s11, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshl_i128_svs:
@@ -6607,48 +6599,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX9-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[0:1], s6
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], s6
+; GFX9-NEXT:    s_lshr_b64 s[12:13], s[0:1], s7
 ; GFX9-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
-; GFX9-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX9-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    s_sub_i32 s0, s4, 64
-; GFX9-NEXT:    s_sub_i32 s1, 64, s4
+; GFX9-NEXT:    s_sub_i32 s3, s4, 64
+; GFX9-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s5, v[2:3]
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX9-NEXT:    s_and_b32 s0, 1, s5
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s3, v[2:3]
+; GFX9-NEXT:    s_and_b32 s3, 1, s2
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_and_b32 s0, 1, s6
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX9-NEXT:    s_and_b32 s3, 1, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, s8, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, s9, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX9-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX9-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, s11, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshl_i128_svs:
@@ -6659,45 +6653,45 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s7
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
 ; GFX10-NEXT:    s_lshl_b64 s[6:7], s[0:1], s6
-; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX10-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_and_b64 s[6:7], s[12:13], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s2, 64, s4
+; GFX10-NEXT:    s_sub_i32 s3, s4, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s0, 64, s4
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX10-NEXT:    s_sub_i32 s0, s4, 64
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s2, v[2:3]
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s3, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, 1, s1
+; GFX10-NEXT:    s_and_b32 s3, 1, s2
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s0, 1, s5
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT:    s_and_b32 s3, 1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
-; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10-NEXT:    v_and_or_b32 v2, s2, v2, s0
+; GFX10-NEXT:    v_and_or_b32 v3, s3, v3, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v0, s6, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, s7, v1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -6710,49 +6704,46 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX11-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[8:9], s[0:1], s7
-; GFX11-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
+; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX11-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
 ; GFX11-NEXT:    s_lshl_b64 s[6:7], s[0:1], s6
-; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX11-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX11-NEXT:    s_and_b64 s[6:7], s[12:13], s[6:7]
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX11-NEXT:    s_sub_i32 s2, 64, s4
+; GFX11-NEXT:    s_sub_i32 s3, s4, 64
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s0, 64, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX11-NEXT:    s_sub_i32 s0, s4, 64
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s2, v[2:3]
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], s3, v[2:3]
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX11-NEXT:    s_and_b32 s0, 1, s1
+; GFX11-NEXT:    s_and_b32 s3, 1, s2
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s0, 1, s5
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX11-NEXT:    s_and_b32 s3, 1, s5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX11-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX11-NEXT:    v_and_or_b32 v2, s2, v2, s0
+; GFX11-NEXT:    v_and_or_b32 v3, s3, v3, s1
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v0, s6, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v1, s7, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6768,47 +6759,48 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX6-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX6-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s8, 0
 ; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s7
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s6
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], s6
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s5
+; GFX6-NEXT:    s_and_b32 s5, 1, s10
+; GFX6-NEXT:    s_mov_b32 s8, 0
+; GFX6-NEXT:    s_bfe_u64 s[6:7], s[10:11], 0x10000
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX6-NEXT:    s_and_b32 s5, 1, s9
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX6-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s6
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
-; GFX6-NEXT:    s_and_b32 s5, 1, s10
+; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    v_and_b32_e32 v7, s7, v9
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT:    s_sub_i32 s10, s4, 64
-; GFX6-NEXT:    s_sub_i32 s8, 64, s4
+; GFX6-NEXT:    s_sub_i32 s7, s4, 64
+; GFX6-NEXT:    s_sub_i32 s10, 64, s4
 ; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    v_and_b32_e32 v6, s6, v8
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[2:3], s4
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX6-NEXT:    v_or_b32_e32 v1, s1, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
@@ -6822,47 +6814,48 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX8-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX8-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX8-NEXT:    s_mov_b32 s8, 0
 ; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX8-NEXT:    s_and_b32 s5, 1, s10
+; GFX8-NEXT:    s_mov_b32 s8, 0
+; GFX8-NEXT:    s_bfe_u64 s[6:7], s[10:11], 0x10000
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX8-NEXT:    s_and_b32 s5, 1, s9
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX8-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
-; GFX8-NEXT:    s_and_b32 s5, 1, s10
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, s7, v9
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT:    s_sub_i32 s10, s4, 64
-; GFX8-NEXT:    s_sub_i32 s8, 64, s4
+; GFX8-NEXT:    s_sub_i32 s7, s4, 64
+; GFX8-NEXT:    s_sub_i32 s10, 64, s4
 ; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    v_and_b32_e32 v6, s6, v8
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[2:3], s4
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
 ; GFX8-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, s1, v7
 ; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
@@ -6876,47 +6869,48 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX9-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX9-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX9-NEXT:    s_mov_b32 s8, 0
 ; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX9-NEXT:    s_and_b32 s5, 1, s10
+; GFX9-NEXT:    s_mov_b32 s8, 0
+; GFX9-NEXT:    s_bfe_u64 s[6:7], s[10:11], 0x10000
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX9-NEXT:    s_and_b32 s5, 1, s9
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX9-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
-; GFX9-NEXT:    s_and_b32 s5, 1, s10
+; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    v_and_b32_e32 v7, s7, v9
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT:    s_sub_i32 s10, s4, 64
-; GFX9-NEXT:    s_sub_i32 s8, 64, s4
+; GFX9-NEXT:    s_sub_i32 s7, s4, 64
+; GFX9-NEXT:    s_sub_i32 s10, 64, s4
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    v_and_b32_e32 v6, s6, v8
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[2:3], s4
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX9-NEXT:    v_or_b32_e32 v1, s1, v7
 ; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
@@ -6927,52 +6921,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX10-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX10-NEXT:    s_sub_i32 s5, s6, 64
-; GFX10-NEXT:    s_sub_i32 s7, 64, s6
+; GFX10-NEXT:    s_sub_i32 s5, 64, s6
+; GFX10-NEXT:    s_sub_i32 s7, s6, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s5, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
-; GFX10-NEXT:    s_and_b32 s6, 1, s8
+; GFX10-NEXT:    s_mov_b32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_and_b32 s9, 1, s8
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
-; GFX10-NEXT:    s_mov_b32 s6, 0
-; GFX10-NEXT:    s_lshl_b32 s7, s2, 31
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX10-NEXT:    s_and_b32 s5, 1, s9
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT:    s_lshl_b32 s11, s2, 31
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX10-NEXT:    s_bfe_u64 s[6:7], s[8:9], 0x10000
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s9
+; GFX10-NEXT:    s_and_b32 s5, 1, s5
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT:    s_sub_i32 s10, s4, 64
-; GFX10-NEXT:    s_sub_i32 s8, 64, s4
+; GFX10-NEXT:    s_sub_i32 s9, s4, 64
+; GFX10-NEXT:    s_sub_i32 s12, 64, s4
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
-; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s4
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
-; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s9
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[10:11], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX10-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX10-NEXT:    s_bfe_u64 s[2:3], s[8:9], 0x10000
+; GFX10-NEXT:    v_and_or_b32 v0, s6, v8, s0
+; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    v_and_or_b32 v1, s7, v9, s1
 ; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -6981,49 +6974,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX11-NEXT:    s_and_not1_b64 s[4:5], 0x7f, s[4:5]
-; GFX11-NEXT:    s_sub_i32 s5, s6, 64
-; GFX11-NEXT:    s_sub_i32 s7, 64, s6
+; GFX11-NEXT:    s_sub_i32 s5, 64, s6
+; GFX11-NEXT:    s_sub_i32 s7, s6, 64
 ; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s5, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
-; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX11-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
-; GFX11-NEXT:    s_and_b32 s6, 1, s8
+; GFX11-NEXT:    s_mov_b32 s10, 0
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    s_and_b32 s9, 1, s8
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
-; GFX11-NEXT:    s_mov_b32 s6, 0
-; GFX11-NEXT:    s_lshl_b32 s7, s2, 31
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX11-NEXT:    s_and_b32 s5, 1, s9
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX11-NEXT:    s_lshl_b32 s11, s2, 31
+; GFX11-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX11-NEXT:    s_bfe_u64 s[6:7], s[8:9], 0x10000
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s9
+; GFX11-NEXT:    s_and_b32 s5, 1, s5
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
 ; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT:    s_sub_i32 s10, s4, 64
-; GFX11-NEXT:    s_sub_i32 s8, 64, s4
+; GFX11-NEXT:    s_sub_i32 s9, s4, 64
+; GFX11-NEXT:    s_sub_i32 s12, 64, s4
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9
-; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
-; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX11-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s4
+; GFX11-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
-; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s9
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX11-NEXT:    s_cselect_b64 s[2:3], s[10:11], s[2:3]
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX11-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX11-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX11-NEXT:    s_bfe_u64 s[2:3], s[8:9], 0x10000
+; GFX11-NEXT:    v_and_or_b32 v0, s6, v8, s0
+; GFX11-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_and_or_b32 v1, s7, v9, s1
 ; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -7160,83 +7154,85 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX6-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX6-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX6-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[24:25], s[0:1], s18
-; GFX6-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX6-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[26:27], s[0:1], s18
+; GFX6-NEXT:    s_lshr_b64 s[28:29], s[0:1], s19
 ; GFX6-NEXT:    s_lshl_b64 s[18:19], s[2:3], s18
-; GFX6-NEXT:    s_or_b64 s[18:19], s[26:27], s[18:19]
+; GFX6-NEXT:    s_or_b64 s[18:19], s[28:29], s[18:19]
+; GFX6-NEXT:    s_bfe_u64 s[28:29], s[24:25], 0x10000
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX6-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX6-NEXT:    s_and_b64 s[26:27], s[28:29], s[26:27]
+; GFX6-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX6-NEXT:    s_mov_b32 s22, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
 ; GFX6-NEXT:    s_lshl_b32 s23, s10, 31
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX6-NEXT:    s_sub_i32 s23, s16, 64
-; GFX6-NEXT:    s_sub_i32 s18, 64, s16
+; GFX6-NEXT:    s_sub_i32 s11, s16, 64
+; GFX6-NEXT:    s_sub_i32 s23, 64, s16
 ; GFX6-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX6-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
+; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[18:19], s[8:9], s16
 ; GFX6-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
-; GFX6-NEXT:    s_lshl_b64 s[18:19], s[8:9], s18
-; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX6-NEXT:    s_lshl_b64 s[24:25], s[8:9], s23
+; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX6-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
-; GFX6-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX6-NEXT:    s_or_b64 s[0:1], s[26:27], s[0:1]
 ; GFX6-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX6-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
-; GFX6-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[18:19], s[4:5], s8
+; GFX6-NEXT:    s_lshr_b64 s[20:21], s[4:5], s9
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
-; GFX6-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
+; GFX6-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
+; GFX6-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX6-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX6-NEXT:    s_lshl_b32 s23, s14, 31
 ; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[14:15], 1
-; GFX6-NEXT:    s_sub_i32 s18, s10, 64
-; GFX6-NEXT:    s_sub_i32 s14, 64, s10
+; GFX6-NEXT:    s_sub_i32 s13, s10, 64
+; GFX6-NEXT:    s_sub_i32 s16, 64, s10
 ; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[12:13], s[8:9], s10
+; GFX6-NEXT:    s_lshr_b64 s[14:15], s[8:9], s10
 ; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
-; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[8:9], s16
+; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s13
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
-; GFX6-NEXT:    s_or_b64 s[4:5], s[16:17], s[4:5]
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[12:13], 0x10000
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], s[14:15]
+; GFX6-NEXT:    s_or_b64 s[4:5], s[18:19], s[4:5]
 ; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -7247,83 +7243,85 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX8-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX8-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[24:25], s[0:1], s18
-; GFX8-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[26:27], s[0:1], s18
+; GFX8-NEXT:    s_lshr_b64 s[28:29], s[0:1], s19
 ; GFX8-NEXT:    s_lshl_b64 s[18:19], s[2:3], s18
-; GFX8-NEXT:    s_or_b64 s[18:19], s[26:27], s[18:19]
+; GFX8-NEXT:    s_or_b64 s[18:19], s[28:29], s[18:19]
+; GFX8-NEXT:    s_bfe_u64 s[28:29], s[24:25], 0x10000
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX8-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX8-NEXT:    s_and_b64 s[26:27], s[28:29], s[26:27]
+; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_mov_b32 s22, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
 ; GFX8-NEXT:    s_lshl_b32 s23, s10, 31
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX8-NEXT:    s_sub_i32 s23, s16, 64
-; GFX8-NEXT:    s_sub_i32 s18, 64, s16
+; GFX8-NEXT:    s_sub_i32 s11, s16, 64
+; GFX8-NEXT:    s_sub_i32 s23, 64, s16
 ; GFX8-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
+; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[18:19], s[8:9], s16
 ; GFX8-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
-; GFX8-NEXT:    s_lshl_b64 s[18:19], s[8:9], s18
-; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX8-NEXT:    s_lshl_b64 s[24:25], s[8:9], s23
+; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX8-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX8-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
-; GFX8-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX8-NEXT:    s_or_b64 s[0:1], s[26:27], s[0:1]
 ; GFX8-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX8-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
-; GFX8-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[18:19], s[4:5], s8
+; GFX8-NEXT:    s_lshr_b64 s[20:21], s[4:5], s9
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
-; GFX8-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
+; GFX8-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
+; GFX8-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX8-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX8-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX8-NEXT:    s_lshl_b32 s23, s14, 31
 ; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[14:15], 1
-; GFX8-NEXT:    s_sub_i32 s18, s10, 64
-; GFX8-NEXT:    s_sub_i32 s14, 64, s10
+; GFX8-NEXT:    s_sub_i32 s13, s10, 64
+; GFX8-NEXT:    s_sub_i32 s16, 64, s10
 ; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[12:13], s[8:9], s10
+; GFX8-NEXT:    s_lshr_b64 s[14:15], s[8:9], s10
 ; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
-; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[8:9], s16
+; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s13
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
-; GFX8-NEXT:    s_or_b64 s[4:5], s[16:17], s[4:5]
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[12:13], 0x10000
+; GFX8-NEXT:    s_and_b64 s[8:9], s[8:9], s[14:15]
+; GFX8-NEXT:    s_or_b64 s[4:5], s[18:19], s[4:5]
 ; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -7334,83 +7332,85 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX9-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX9-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[24:25], s[0:1], s18
-; GFX9-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[26:27], s[0:1], s18
+; GFX9-NEXT:    s_lshr_b64 s[28:29], s[0:1], s19
 ; GFX9-NEXT:    s_lshl_b64 s[18:19], s[2:3], s18
-; GFX9-NEXT:    s_or_b64 s[18:19], s[26:27], s[18:19]
+; GFX9-NEXT:    s_or_b64 s[18:19], s[28:29], s[18:19]
+; GFX9-NEXT:    s_bfe_u64 s[28:29], s[24:25], 0x10000
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX9-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX9-NEXT:    s_and_b64 s[26:27], s[28:29], s[26:27]
+; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX9-NEXT:    s_mov_b32 s22, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
 ; GFX9-NEXT:    s_lshl_b32 s23, s10, 31
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX9-NEXT:    s_sub_i32 s23, s16, 64
-; GFX9-NEXT:    s_sub_i32 s18, 64, s16
+; GFX9-NEXT:    s_sub_i32 s11, s16, 64
+; GFX9-NEXT:    s_sub_i32 s23, 64, s16
 ; GFX9-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
+; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[18:19], s[8:9], s16
 ; GFX9-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
-; GFX9-NEXT:    s_lshl_b64 s[18:19], s[8:9], s18
-; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX9-NEXT:    s_lshl_b64 s[24:25], s[8:9], s23
+; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX9-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
-; GFX9-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX9-NEXT:    s_or_b64 s[0:1], s[26:27], s[0:1]
 ; GFX9-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX9-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
-; GFX9-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[18:19], s[4:5], s8
+; GFX9-NEXT:    s_lshr_b64 s[20:21], s[4:5], s9
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
-; GFX9-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
+; GFX9-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
+; GFX9-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX9-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX9-NEXT:    s_lshl_b32 s23, s14, 31
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[14:15], 1
-; GFX9-NEXT:    s_sub_i32 s18, s10, 64
-; GFX9-NEXT:    s_sub_i32 s14, 64, s10
+; GFX9-NEXT:    s_sub_i32 s13, s10, 64
+; GFX9-NEXT:    s_sub_i32 s16, 64, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[12:13], s[8:9], s10
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[8:9], s10
 ; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
-; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[8:9], s16
+; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s13
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
-; GFX9-NEXT:    s_or_b64 s[4:5], s[16:17], s[4:5]
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[12:13], 0x10000
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], s[14:15]
+; GFX9-NEXT:    s_or_b64 s[4:5], s[18:19], s[4:5]
 ; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -7421,83 +7421,85 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX10-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX10-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX10-NEXT:    s_mov_b32 s22, 0
-; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_mov_b32 s24, 0
+; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s19
-; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], s18
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX10-NEXT:    s_lshl_b64 s[28:29], s[2:3], s18
 ; GFX10-NEXT:    s_lshl_b64 s[18:19], s[0:1], s18
-; GFX10-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX10-NEXT:    s_or_b64 s[26:27], s[26:27], s[28:29]
+; GFX10-NEXT:    s_bfe_u64 s[28:29], s[22:23], 0x10000
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
+; GFX10-NEXT:    s_and_b64 s[18:19], s[28:29], s[18:19]
+; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[26:27], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX10-NEXT:    s_cselect_b64 s[18:19], s[18:19], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
-; GFX10-NEXT:    s_lshl_b32 s23, s10, 31
+; GFX10-NEXT:    s_lshl_b32 s25, s10, 31
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX10-NEXT:    s_sub_i32 s23, s16, 64
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
+; GFX10-NEXT:    s_sub_i32 s11, s16, 64
 ; GFX10-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX10-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
-; GFX10-NEXT:    s_lshl_b64 s[24:25], s[8:9], s17
+; GFX10-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[22:23], s[0:1], s16
+; GFX10-NEXT:    s_lshl_b64 s[26:27], s[8:9], s17
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
-; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[24:25]
-; GFX10-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_or_b64 s[22:23], s[22:23], s[26:27]
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[22:23], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s25, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
+; GFX10-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
 ; GFX10-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX10-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
 ; GFX10-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[16:17], s[4:5], s9
-; GFX10-NEXT:    s_lshl_b64 s[18:19], s[6:7], s8
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX10-NEXT:    s_lshl_b64 s[20:21], s[6:7], s8
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
+; GFX10-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[18:19], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
-; GFX10-NEXT:    s_lshl_b32 s23, s14, 31
+; GFX10-NEXT:    s_lshl_b32 s25, s14, 31
 ; GFX10-NEXT:    s_lshr_b64 s[12:13], s[14:15], 1
-; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
-; GFX10-NEXT:    s_sub_i32 s18, s10, 64
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[24:25]
+; GFX10-NEXT:    s_sub_i32 s15, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[14:15], s[4:5], s10
-; GFX10-NEXT:    s_lshl_b64 s[16:17], s[12:13], s11
+; GFX10-NEXT:    s_lshr_b64 s[16:17], s[4:5], s10
+; GFX10-NEXT:    s_lshl_b64 s[18:19], s[12:13], s11
 ; GFX10-NEXT:    s_lshr_b64 s[10:11], s[12:13], s10
-; GFX10-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GFX10-NEXT:    s_lshr_b64 s[12:13], s[12:13], s18
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX10-NEXT:    s_cselect_b64 s[12:13], s[14:15], s[12:13]
+; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_lshr_b64 s[12:13], s[12:13], s15
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b64 s[12:13], s[16:17], s[12:13]
 ; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[12:13]
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX10-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX10-NEXT:    s_bfe_u64 s[12:13], s[14:15], 0x10000
 ; GFX10-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX10-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -7508,83 +7510,87 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX11-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX11-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX11-NEXT:    s_mov_b32 s22, 0
-; GFX11-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX11-NEXT:    s_mov_b32 s24, 0
+; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[24:25], s[0:1], s19
-; GFX11-NEXT:    s_lshl_b64 s[26:27], s[2:3], s18
+; GFX11-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX11-NEXT:    s_lshl_b64 s[28:29], s[2:3], s18
 ; GFX11-NEXT:    s_lshl_b64 s[18:19], s[0:1], s18
-; GFX11-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX11-NEXT:    s_or_b64 s[26:27], s[26:27], s[28:29]
+; GFX11-NEXT:    s_bfe_u64 s[28:29], s[22:23], 0x10000
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
+; GFX11-NEXT:    s_and_b64 s[18:19], s[28:29], s[18:19]
+; GFX11-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[26:27], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX11-NEXT:    s_cselect_b64 s[18:19], s[18:19], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
-; GFX11-NEXT:    s_lshl_b32 s23, s10, 31
+; GFX11-NEXT:    s_lshl_b32 s25, s10, 31
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX11-NEXT:    s_sub_i32 s23, s16, 64
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
+; GFX11-NEXT:    s_sub_i32 s11, s16, 64
 ; GFX11-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX11-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX11-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
-; GFX11-NEXT:    s_lshl_b64 s[24:25], s[8:9], s17
+; GFX11-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[22:23], s[0:1], s16
+; GFX11-NEXT:    s_lshl_b64 s[26:27], s[8:9], s17
 ; GFX11-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
-; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[24:25]
-; GFX11-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX11-NEXT:    s_or_b64 s[22:23], s[22:23], s[26:27]
+; GFX11-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX11-NEXT:    s_cselect_b64 s[8:9], s[22:23], s[8:9]
+; GFX11-NEXT:    s_cmp_lg_u32 s25, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
+; GFX11-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
 ; GFX11-NEXT:    s_and_not1_b64 s[10:11], 0x7f, s[20:21]
+; GFX11-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX11-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX11-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[16:17], s[4:5], s9
-; GFX11-NEXT:    s_lshl_b64 s[18:19], s[6:7], s8
+; GFX11-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX11-NEXT:    s_lshl_b64 s[20:21], s[6:7], s8
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX11-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
+; GFX11-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], s[8:9]
+; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX11-NEXT:    s_cselect_b64 s[4:5], s[18:19], s[4:5]
+; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
-; GFX11-NEXT:    s_lshl_b32 s23, s14, 31
+; GFX11-NEXT:    s_lshl_b32 s25, s14, 31
 ; GFX11-NEXT:    s_lshr_b64 s[12:13], s[14:15], 1
-; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
-; GFX11-NEXT:    s_sub_i32 s18, s10, 64
+; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[24:25]
+; GFX11-NEXT:    s_sub_i32 s15, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[14:15], s[4:5], s10
-; GFX11-NEXT:    s_lshl_b64 s[16:17], s[12:13], s11
+; GFX11-NEXT:    s_lshr_b64 s[16:17], s[4:5], s10
+; GFX11-NEXT:    s_lshl_b64 s[18:19], s[12:13], s11
 ; GFX11-NEXT:    s_lshr_b64 s[10:11], s[12:13], s10
-; GFX11-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GFX11-NEXT:    s_lshr_b64 s[12:13], s[12:13], s18
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX11-NEXT:    s_cselect_b64 s[12:13], s[14:15], s[12:13]
+; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX11-NEXT:    s_lshr_b64 s[12:13], s[12:13], s15
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX11-NEXT:    s_cselect_b64 s[12:13], s[16:17], s[12:13]
 ; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[12:13]
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX11-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX11-NEXT:    s_bfe_u64 s[12:13], s[14:15], 0x10000
 ; GFX11-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX11-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
@@ -7607,39 +7613,38 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_or_b32_e32 v16, v16, v18
 ; GFX6-NEXT:    v_or_b32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v21, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v19, 0, v22, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v21, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v22, v1, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], 1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 31, v10
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[10:11], 1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v24
-; GFX6-NEXT:    v_subrev_i32_e32 v23, vcc, 64, v24
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[0:1], v24
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], v10
-; GFX6-NEXT:    v_lshr_b64 v[16:17], v[2:3], v24
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v23
-; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 31, v10
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[10:11], 1
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v24
+; GFX6-NEXT:    v_subrev_i32_e32 v22, vcc, 64, v24
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[0:1], v24
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[8:9], v16
+; GFX6-NEXT:    v_and_b32_e32 v21, v18, v21
+; GFX6-NEXT:    v_lshr_b64 v[18:19], v[8:9], v24
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[8:9], v22
+; GFX6-NEXT:    v_or_b32_e32 v10, v10, v16
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX6-NEXT:    v_or_b32_e32 v0, v18, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v8, v8, v18
+; GFX6-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX6-NEXT:    v_and_b32_e32 v18, 0x7f, v20
 ; GFX6-NEXT:    v_not_b32_e32 v8, v20
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX6-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v18
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
 ; GFX6-NEXT:    v_subrev_i32_e32 v20, vcc, 64, v18
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v8
 ; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v18
@@ -7648,39 +7653,36 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX6-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, v4, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], 1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 31, v14
-; GFX6-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX6-NEXT:    v_lshr_b64 v[6:7], v[14:15], 1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v19
-; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, 64, v19
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v19
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v10
-; GFX6-NEXT:    v_lshr_b64 v[12:13], v[6:7], v19
-; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], v14
-; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
-; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 31, v14
+; GFX6-NEXT:    v_or_b32_e32 v5, v5, v8
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[14:15], 1
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 64, v19
+; GFX6-NEXT:    v_and_b32_e32 v16, v10, v16
+; GFX6-NEXT:    v_subrev_i32_e32 v17, vcc, 64, v19
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[4:5], v19
+; GFX6-NEXT:    v_lshl_b64 v[12:13], v[8:9], v12
+; GFX6-NEXT:    v_lshr_b64 v[14:15], v[8:9], v19
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[8:9], v17
+; GFX6-NEXT:    v_or_b32_e32 v10, v10, v12
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v12, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX6-NEXT:    v_or_b32_e32 v2, v21, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v22, v3
+; GFX6-NEXT:    v_or_b32_e32 v11, v11, v13
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GFX6-NEXT:    v_and_b32_e32 v8, v8, v14
+; GFX6-NEXT:    v_or_b32_e32 v0, v21, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
 ; GFX6-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX6-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX6-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX6-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i128:
@@ -7698,39 +7700,38 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_or_b32_e32 v16, v16, v18
 ; GFX8-NEXT:    v_or_b32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v21, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, 0, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v1, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 31, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[10:11]
-; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v24
-; GFX8-NEXT:    v_subrev_u32_e32 v23, vcc, 64, v24
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v24, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[16:17], v24, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v23, v[2:3]
-; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 31, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], 1, v[10:11]
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v24
+; GFX8-NEXT:    v_subrev_u32_e32 v22, vcc, 64, v24
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v24, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[8:9]
+; GFX8-NEXT:    v_and_b32_e32 v21, v18, v21
+; GFX8-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v22, v[8:9]
+; GFX8-NEXT:    v_or_b32_e32 v10, v10, v16
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX8-NEXT:    v_or_b32_e32 v0, v18, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v8, v8, v18
+; GFX8-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0x7f, v20
 ; GFX8-NEXT:    v_not_b32_e32 v8, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX8-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
 ; GFX8-NEXT:    v_subrev_u32_e32 v20, vcc, 64, v18
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
@@ -7739,39 +7740,36 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[12:13]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 31, v14
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-NEXT:    v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v19
-; GFX8-NEXT:    v_subrev_u32_e32 v14, vcc, 64, v19
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v19, v[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[12:13], v19, v[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v14, v[6:7]
-; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
-; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 31, v14
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], 1, v[14:15]
+; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 64, v19
+; GFX8-NEXT:    v_and_b32_e32 v16, v10, v16
+; GFX8-NEXT:    v_subrev_u32_e32 v17, vcc, 64, v19
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v19, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[12:13], v12, v[8:9]
+; GFX8-NEXT:    v_lshrrev_b64 v[14:15], v19, v[8:9]
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v17, v[8:9]
+; GFX8-NEXT:    v_or_b32_e32 v10, v10, v12
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX8-NEXT:    v_or_b32_e32 v2, v21, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v22, v3
+; GFX8-NEXT:    v_or_b32_e32 v11, v11, v13
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GFX8-NEXT:    v_and_b32_e32 v8, v8, v14
+; GFX8-NEXT:    v_or_b32_e32 v0, v21, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
 ; GFX8-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX8-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX8-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i128:
@@ -7789,54 +7787,52 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_or_b32_e32 v16, v16, v18
 ; GFX9-NEXT:    v_or_b32_e32 v17, v17, v19
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v21, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, 0, v22, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v16, v1, v17, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v0, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v16, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v16, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[10:11]
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v10, 31, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v24
-; GFX9-NEXT:    v_subrev_u32_e32 v23, 64, v24
+; GFX9-NEXT:    v_subrev_u32_e32 v25, 64, v24
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v24, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v24, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v23, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v25, v[2:3]
 ; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX9-NEXT:    v_or_b32_e32 v0, v18, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_or_b32 v0, v18, v21, v0
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0x7f, v20
 ; GFX9-NEXT:    v_not_b32_e32 v8, v20
-; GFX9-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v16, v19
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v20, 64, v18
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v17, v23
 ; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v18, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v20, v[4:5]
 ; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v5, v9, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
 ; GFX9-NEXT:    v_cndmask_b32_e32 v18, v4, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v8, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v8, v7, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], 1, v[14:15]
 ; GFX9-NEXT:    v_lshl_or_b32 v5, v14, 31, v5
 ; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v19
@@ -7853,199 +7849,187 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX9-NEXT:    v_or_b32_e32 v2, v21, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v22, v3
-; GFX9-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX9-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX9-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX9-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_and_or_b32 v1, 0, v22, v1
+; GFX9-NEXT:    v_and_or_b32 v4, v20, v16, v4
+; GFX9-NEXT:    v_and_or_b32 v5, 0, v17, v5
+; GFX9-NEXT:    v_and_or_b32 v6, v6, v12, v18
+; GFX9-NEXT:    v_and_or_b32 v7, 0, v13, v21
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0x7f, v16
-; GFX10-NEXT:    v_not_b32_e32 v16, v16
+; GFX10-NEXT:    v_not_b32_e32 v21, v16
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v17, 64, v27
-; GFX10-NEXT:    v_and_b32_e32 v28, 0x7f, v16
 ; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v27, v[2:3]
+; GFX10-NEXT:    v_and_b32_e32 v28, 0x7f, v21
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v27
 ; GFX10-NEXT:    v_lshl_or_b32 v9, v10, 31, v9
-; GFX10-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v17, v[0:1]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v25, 64, v28
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v29, 64, v27
-; GFX10-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
-; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v27, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v27
-; GFX10-NEXT:    v_or_b32_e32 v18, v16, v18
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 64, v28
-; GFX10-NEXT:    v_lshlrev_b64 v[25:26], v25, v[10:11]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v29, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v21, v[0:1]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v29, 64, v28
+; GFX10-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
+; GFX10-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v28
 ; GFX10-NEXT:    v_or_b32_e32 v19, v17, v19
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v28
-; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v16, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s5, 64, v28
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v27
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v16, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[25:26], v18, v[10:11]
+; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v29, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v22, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v28
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v27, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v23, v23, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v0, v18, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v0, v24, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v1, v19, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v28
-; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v0, s4
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX10-NEXT:    v_and_b32_e32 v24, 0x7f, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v16, v8, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v17, v9, s5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, v1, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v21, v3
-; GFX10-NEXT:    v_not_b32_e32 v3, v20
-; GFX10-NEXT:    v_or_b32_e32 v1, v22, v8
+; GFX10-NEXT:    v_or_b32_e32 v24, v24, v26
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v21, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v19, v3, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v28, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v24, s5
+; GFX10-NEXT:    v_and_b32_e32 v23, 0x7f, v20
+; GFX10-NEXT:    v_not_b32_e32 v11, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v16, 64, v23
+; GFX10-NEXT:    v_and_b32_e32 v24, 0x7f, v11
+; GFX10-NEXT:    v_and_or_b32 v2, v10, v2, v21
+; GFX10-NEXT:    v_and_or_b32 v0, v18, v0, v8
+; GFX10-NEXT:    v_and_or_b32 v1, 0, v1, v9
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v11, 64, v24
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX10-NEXT:    v_and_b32_e32 v22, 0x7f, v3
-; GFX10-NEXT:    v_lshlrev_b64 v[12:13], v24, v[6:7]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v24
-; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v11, v[4:5]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v16, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[12:13], v23, v[6:7]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v24
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v25, 64, v23
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v23, v[4:5]
 ; GFX10-NEXT:    v_lshl_or_b32 v9, v14, 31, v9
 ; GFX10-NEXT:    v_lshrrev_b64 v[14:15], 1, v[14:15]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v22
-; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v24, v[4:5]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
 ; GFX10-NEXT:    v_or_b32_e32 v12, v10, v12
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v22
-; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v22, v[8:9]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v24
+; GFX10-NEXT:    v_or_b32_e32 v13, v11, v13
+; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v25, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
-; GFX10-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX10-NEXT:    v_or_b32_e32 v5, v11, v13
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[14:15]
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v22
-; GFX10-NEXT:    v_or_b32_e32 v16, v18, v20
-; GFX10-NEXT:    v_or_b32_e32 v18, v19, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v3, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[3:4], v22, v[14:15]
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v22
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v24
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v18, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s4
-; GFX10-NEXT:    v_or_b32_e32 v3, v23, v25
-; GFX10-NEXT:    v_or_b32_e32 v4, v13, v5
-; GFX10-NEXT:    v_or_b32_e32 v5, v14, v8
-; GFX10-NEXT:    v_or_b32_e32 v6, v6, v9
-; GFX10-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v23
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v24
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v23
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v3, v22
+; GFX10-NEXT:    v_or_b32_e32 v18, v18, v20
+; GFX10-NEXT:    v_or_b32_e32 v19, v19, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v10, v18, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v24
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v11, v19, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v24, v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v4, v6, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v12, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v13, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
+; GFX10-NEXT:    v_and_or_b32 v7, 0, v11, v7
+; GFX10-NEXT:    v_and_or_b32 v4, v25, v16, v4
+; GFX10-NEXT:    v_and_or_b32 v5, 0, v17, v8
+; GFX10-NEXT:    v_and_or_b32 v6, v9, v10, v6
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v27, 0x7f, v16
-; GFX11-NEXT:    v_not_b32_e32 v16, v16
+; GFX11-NEXT:    v_not_b32_e32 v21, v16
 ; GFX11-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[21:22], v27, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v17, 64, v27
+; GFX11-NEXT:    v_lshlrev_b64 v[18:19], v27, v[2:3]
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0x7f, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshl_or_b32 v9, v10, 31, v9
 ; GFX11-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX11-NEXT:    v_dual_cndmask_b32 v21, 0, v21 :: v_dual_cndmask_b32 v22, 0, v22
-; GFX11-NEXT:    v_sub_nc_u32_e32 v17, 64, v27
-; GFX11-NEXT:    v_lshlrev_b64 v[18:19], v27, v[2:3]
-; GFX11-NEXT:    v_sub_nc_u32_e32 v25, 64, v28
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v29, 64, v27
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
 ; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v17, v[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v28
-; GFX11-NEXT:    v_lshlrev_b64 v[25:26], v25, v[10:11]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v29, v[0:1]
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v28
-; GFX11-NEXT:    v_or_b32_e32 v18, v16, v18
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v16, 64, v28
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v27
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX11-NEXT:    v_and_b32_e32 v28, 0x7f, v21
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v21, 64, v27
 ; GFX11-NEXT:    v_or_b32_e32 v19, v17, v19
+; GFX11-NEXT:    v_lshlrev_b64 v[21:22], v21, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v27, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, v21, v16, vcc_lo
+; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v28
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v29, 64, v28
+; GFX11-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
+; GFX11-NEXT:    v_cmp_gt_u32_e64 s1, 64, v28
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, v21, v2, s0
+; GFX11-NEXT:    v_lshlrev_b64 v[25:26], v18, v[10:11]
+; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v29, v[10:11]
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v23, v23, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v0, v18, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v16, v[10:11]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v1, v19, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v0, v24, v26
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v27
-; GFX11-NEXT:    v_and_b32_e32 v24, 0x7f, v20
-; GFX11-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v17, v17, v0, s0
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v23, v19, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v16, v8, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v17, v9, s1
-; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 64, v24
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v25, 0, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v21, v3
-; GFX11-NEXT:    v_not_b32_e32 v3, v20
-; GFX11-NEXT:    v_or_b32_e32 v1, v22, v8
+; GFX11-NEXT:    v_or_b32_e32 v24, v24, v26
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s1
+; GFX11-NEXT:    v_cndmask_b32_e32 v19, v22, v19, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, v17, v24, s1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v28
+; GFX11-NEXT:    v_and_b32_e32 v23, 0x7f, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, v19, v3, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v28, v[10:11]
+; GFX11-NEXT:    v_not_b32_e32 v11, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc_lo
+; GFX11-NEXT:    v_and_or_b32 v0, v18, v0, v8
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
+; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v23
+; GFX11-NEXT:    v_and_b32_e32 v24, 0x7f, v11
+; GFX11-NEXT:    v_and_or_b32 v1, 0, v1, v9
 ; GFX11-NEXT:    v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v11, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[12:13], v24, v[6:7]
-; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v24, v[4:5]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
-; GFX11-NEXT:    v_and_b32_e32 v22, 0x7f, v3
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 64, v24
+; GFX11-NEXT:    v_and_or_b32 v2, v10, v2, v21
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v16, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[12:13], v23, v[6:7]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v20, 64, v24
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v25, 64, v23
 ; GFX11-NEXT:    v_lshl_or_b32 v9, v14, 31, v9
 ; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 1, v[14:15]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v23, v[4:5]
 ; GFX11-NEXT:    v_or_b32_e32 v12, v10, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX11-NEXT:    v_or_b32_e32 v5, v11, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0, v16, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v20, 64, v22
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v22
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], v22, v[8:9]
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v22
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v3, v12, vcc_lo
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v24
+; GFX11-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX11-NEXT:    v_or_b32_e32 v13, v11, v13
 ; GFX11-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v25, v[4:5]
 ; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[14:15]
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], v22, v[14:15]
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v22
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v24
-; GFX11-NEXT:    v_or_b32_e32 v16, v18, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, v18, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s0
-; GFX11-NEXT:    v_or_b32_e32 v3, v23, v25
-; GFX11-NEXT:    v_or_b32_e32 v4, v13, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v14, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v23
+; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v24
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v23
+; GFX11-NEXT:    v_or_b32_e32 v18, v18, v20
+; GFX11-NEXT:    v_or_b32_e32 v19, v19, v21
+; GFX11-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v4, v12 :: v_dual_cndmask_b32 v5, v5, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v10, v18, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v24
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v11, v19, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v24, v[14:15]
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v4, v6, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s1
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v12, v8, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v13, v9, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v3, v22
+; GFX11-NEXT:    v_and_or_b32 v7, 0, v11, v7
+; GFX11-NEXT:    v_and_or_b32 v4, v25, v16, v4
+; GFX11-NEXT:    v_and_or_b32 v5, 0, v17, v8
+; GFX11-NEXT:    v_and_or_b32 v6, v9, v10, v6
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index c8455665e7b40..5be266e141edf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -5814,40 +5814,37 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_sub_i32 s11, s8, 64
-; GFX6-NEXT:    s_sub_i32 s9, 64, s8
+; GFX6-NEXT:    s_sub_i32 s2, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s8
-; GFX6-NEXT:    s_lshr_b64 s[14:15], s[12:13], s9
+; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[12:13], s2
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX6-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX6-NEXT:    s_lshl_b64 s[12:13], s[12:13], s11
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[12:13]
-; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT:    s_sub_i32 s14, s10, 64
-; GFX6-NEXT:    s_sub_i32 s12, 64, s10
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[12:13], s11
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX6-NEXT:    s_sub_i32 s9, s10, 64
+; GFX6-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
-; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX6-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX6-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
-; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX6-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX6-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[12:13], s[6:7], s10
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[6:7], s11
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], s[12:13]
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fshr_i128:
@@ -5860,40 +5857,37 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_sub_i32 s11, s8, 64
-; GFX8-NEXT:    s_sub_i32 s9, 64, s8
+; GFX8-NEXT:    s_sub_i32 s2, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[12:13], s8
-; GFX8-NEXT:    s_lshr_b64 s[14:15], s[12:13], s9
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[12:13], s2
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX8-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX8-NEXT:    s_lshl_b64 s[12:13], s[12:13], s11
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[12:13]
-; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX8-NEXT:    s_sub_i32 s14, s10, 64
-; GFX8-NEXT:    s_sub_i32 s12, 64, s10
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[12:13], s11
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX8-NEXT:    s_sub_i32 s9, s10, 64
+; GFX8-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
-; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX8-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX8-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
-; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX8-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[12:13], s[6:7], s10
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[6:7], s11
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], s[12:13]
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fshr_i128:
@@ -5906,40 +5900,37 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT:    s_mov_b32 s1, 0
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_sub_i32 s11, s8, 64
-; GFX9-NEXT:    s_sub_i32 s9, 64, s8
+; GFX9-NEXT:    s_sub_i32 s2, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[12:13], s8
-; GFX9-NEXT:    s_lshr_b64 s[14:15], s[12:13], s9
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[12:13], s2
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX9-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX9-NEXT:    s_lshl_b64 s[12:13], s[12:13], s11
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[12:13]
-; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX9-NEXT:    s_sub_i32 s14, s10, 64
-; GFX9-NEXT:    s_sub_i32 s12, 64, s10
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[12:13], s11
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX9-NEXT:    s_sub_i32 s9, s10, 64
+; GFX9-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
-; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX9-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX9-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
-; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[12:13], s[6:7], s10
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[6:7], s11
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[12:13]
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fshr_i128:
@@ -5954,37 +5945,34 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[12:13], s[0:1], s9
-; GFX10-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
-; GFX10-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX10-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_or_b64 s[8:9], s[12:13], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s14, s10, 64
+; GFX10-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
 ; GFX10-NEXT:    s_lshl_b64 s[12:13], s[6:7], s11
 ; GFX10-NEXT:    s_lshr_b64 s[10:11], s[6:7], s10
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[10:11], 0
-; GFX10-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -6000,37 +5988,35 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX11-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s15, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[12:13], s[0:1], s9
-; GFX11-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
-; GFX11-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX11-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
-; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX11-NEXT:    s_or_b64 s[8:9], s[12:13], s[8:9]
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s14, s10, 64
+; GFX11-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX11-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
 ; GFX11-NEXT:    s_lshl_b64 s[12:13], s[6:7], s11
 ; GFX11-NEXT:    s_lshr_b64 s[10:11], s[6:7], s10
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
-; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[10:11], 0
-; GFX11-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6041,145 +6027,127 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshr_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v14, 0x7f, v8
+; GFX6-NEXT:    v_and_b32_e32 v12, 0x7f, v8
 ; GFX6-NEXT:    v_not_b32_e32 v8, v8
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    v_and_b32_e32 v15, 0x7f, v8
+; GFX6-NEXT:    v_and_b32_e32 v13, 0x7f, v8
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v15
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v13
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], v0
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], v15
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v15
-; GFX6-NEXT:    v_lshl_b64 v[12:13], v[8:9], v15
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], v13
+; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, 64, v13
 ; GFX6-NEXT:    v_or_b32_e32 v10, v0, v10
 ; GFX6-NEXT:    v_or_b32_e32 v11, v1, v11
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[8:9], v16
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX6-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[8:9], v14
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v11, v1, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v14
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], v14
-; GFX6-NEXT:    v_lshl_b64 v[2:3], v[6:7], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v15, vcc, 64, v14
-; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], v15
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[6:7], v14
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v12
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], v12
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[6:7], v8
+; GFX6-NEXT:    v_subrev_i32_e32 v13, vcc, 64, v12
+; GFX6-NEXT:    v_or_b32_e32 v8, v0, v8
+; GFX6-NEXT:    v_or_b32_e32 v9, v1, v9
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], v13
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[6:7], v12
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v10
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v12, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v13, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, v10, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v14, 0x7f, v8
+; GFX8-NEXT:    v_and_b32_e32 v12, 0x7f, v8
 ; GFX8-NEXT:    v_not_b32_e32 v8, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v15, 0x7f, v8
+; GFX8-NEXT:    v_and_b32_e32 v13, 0x7f, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v15
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v13
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v15, v[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v15
-; GFX8-NEXT:    v_lshlrev_b64 v[12:13], v15, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v13, v[2:3]
+; GFX8-NEXT:    v_subrev_u32_e32 v14, vcc, 64, v13
 ; GFX8-NEXT:    v_or_b32_e32 v10, v0, v10
 ; GFX8-NEXT:    v_or_b32_e32 v11, v1, v11
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v16, v[8:9]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v14, v[8:9]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v1, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v14
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v14, v[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, 64, v14
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v14, v[6:7]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v12
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v12, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v8, v[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v13, vcc, 64, v12
+; GFX8-NEXT:    v_or_b32_e32 v8, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v9, v1, v9
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v13, v[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v12, v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v10
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v13, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v10, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v14, 0x7f, v8
+; GFX9-NEXT:    v_and_b32_e32 v12, 0x7f, v8
 ; GFX9-NEXT:    v_not_b32_e32 v8, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v15, 0x7f, v8
+; GFX9-NEXT:    v_and_b32_e32 v13, 0x7f, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v15
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v13
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, v[8:9]
-; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v15, v[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v15
-; GFX9-NEXT:    v_lshlrev_b64 v[12:13], v15, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v13, v[2:3]
+; GFX9-NEXT:    v_subrev_u32_e32 v14, 64, v13
 ; GFX9-NEXT:    v_or_b32_e32 v10, v0, v10
 ; GFX9-NEXT:    v_or_b32_e32 v11, v1, v11
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v16, v[8:9]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v14, v[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v14
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v1, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v14, v[4:5]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v12, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; GFX9-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GFX9-NEXT:    v_subrev_u32_e32 v13, 64, v12
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v14, v[6:7]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v13, v[6:7]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v12, v[6:7]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v12, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v13, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v10, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v8, v10
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v9, v11
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i128:
@@ -6188,98 +6156,90 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_not_b32_e32 v9, v8
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 31, v1
-; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX10-NEXT:    v_and_b32_e32 v18, 0x7f, v9
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
-; GFX10-NEXT:    v_lshrrev_b64 v[12:13], v19, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
+; GFX10-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[14:15], v18, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX10-NEXT:    v_lshrrev_b64 v[12:13], v19, v[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v21, v[6:7]
+; GFX10-NEXT:    v_or_b32_e32 v8, v10, v8
+; GFX10-NEXT:    v_or_b32_e32 v9, v11, v9
+; GFX10-NEXT:    v_or_b32_e32 v10, v14, v16
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v19
-; GFX10-NEXT:    v_or_b32_e32 v12, v12, v16
-; GFX10-NEXT:    v_or_b32_e32 v10, v10, v8
-; GFX10-NEXT:    v_or_b32_e32 v11, v11, v9
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v21, v[6:7]
-; GFX10-NEXT:    v_or_b32_e32 v13, v13, v17
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v14, v4
-; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v8, v15, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v6, v5, s5
+; GFX10-NEXT:    v_and_or_b32 v2, v7, v12, v2
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v13, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v9, v8
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 31, v1
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0x7f, v9
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b64 v[14:15], v18, v[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
 ; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0, v14, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], v19, v[4:5]
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v8, v10, v8
+; GFX11-NEXT:    v_or_b32_e32 v9, v11, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
+; GFX11-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
+; GFX11-NEXT:    v_lshrrev_b64 v[12:13], v19, v[6:7]
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v8
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v9
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v21, v[6:7]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
+; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v21, v[6:7]
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v19
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v16
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0, v15, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v18
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v10, v14, v16
+; GFX11-NEXT:    v_or_b32_e32 v8, v15, v17
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v13, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v14, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v7, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, v10, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v6, v5, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_and_or_b32 v2, v7, v12, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
   ret i128 %result
@@ -6288,106 +6248,94 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshr_i128_ssv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_and_b32_e32 v6, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX6-NEXT:    v_not_b32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s9, 0
-; GFX6-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v4, 0x7f, v0
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX6-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v7
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v4
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[10:11], v0
-; GFX6-NEXT:    v_lshl_b64 v[2:3], s[0:1], v7
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v7
-; GFX6-NEXT:    v_lshl_b64 v[4:5], s[10:11], v7
+; GFX6-NEXT:    v_lshl_b64 v[2:3], s[0:1], v4
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 64, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshl_b64 v[0:1], s[10:11], v8
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT:    v_lshl_b64 v[0:1], s[10:11], v5
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v6
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v6
-; GFX6-NEXT:    v_lshl_b64 v[2:3], s[6:7], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v6
-; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[6:7], v11
-; GFX6-NEXT:    v_lshr_b64 v[4:5], s[6:7], v6
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v8
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v8
+; GFX6-NEXT:    v_lshl_b64 v[4:5], s[6:7], v4
+; GFX6-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
+; GFX6-NEXT:    v_or_b32_e32 v4, v0, v4
+; GFX6-NEXT:    v_or_b32_e32 v5, v1, v5
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[6:7], v9
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX6-NEXT:    v_lshr_b64 v[6:7], s[6:7], v8
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v5, s5
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshr_i128_ssv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_and_b32_e32 v6, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    s_mov_b32 s9, 0
-; GFX8-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0x7f, v0
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v7
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v4
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[10:11]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v7, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v7
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v7, s[10:11]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, s[0:1]
+; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 64, v4
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, s[10:11]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v5, s[10:11]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v6
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v6, s[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 64, v6
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v8, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v4, s[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v9, vcc, 64, v8
+; GFX8-NEXT:    v_or_b32_e32 v4, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v1, v5
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v9, s[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshr_i128_ssv:
@@ -6395,51 +6343,45 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    s_mov_b32 s9, 0
-; GFX9-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7f, v0
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v7
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v4
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[10:11]
-; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, s[0:1]
-; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v7
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v7, s[10:11]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, s[0:1]
+; GFX9-NEXT:    v_subrev_u32_e32 v5, 64, v4
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, s[10:11]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v5, s[10:11]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v1, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v6, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX9-NEXT:    v_subrev_u32_e32 v11, 64, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v9, 64, v6
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v9, s[6:7]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v4, v7
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v5, v8
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshr_i128_ssv:
@@ -6450,96 +6392,82 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0x7f, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 64, v13
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v13
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v12
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v13, s[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v6, s[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[10:11]
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[10:11]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v12, s[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, s[6:7]
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
-; GFX10-NEXT:    v_or_b32_e32 v4, v4, v8
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v1
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX10-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v0, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v10, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v11, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v1, s3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, s8, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, s9, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX10-NEXT:    v_or_b32_e32 v0, v6, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX10-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v4, s5, s1
+; GFX10-NEXT:    v_and_or_b32 v2, v5, v2, v6
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v3, v7
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_fshr_i128_ssv:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_not_b32_e32 v1, v0
-; GFX11-NEXT:    s_lshr_b32 s8, s1, 31
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX11-NEXT:    v_and_b32_e32 v13, 0x7f, v0
 ; GFX11-NEXT:    s_mov_b32 s9, 0
 ; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX11-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX11-NEXT:    v_and_b32_e32 v12, 0x7f, v1
-; GFX11-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v12, s[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0x7f, v0
+; GFX11-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
+; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 64, v13
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v13
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v12
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v13, s[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v6, s[6:7]
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[10:11]
+; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v8, s[10:11]
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, s[6:7]
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v1
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v12
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v0, s2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v10, v4, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v11, v5, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v1, s3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v8, s8, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v10, s9, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v6, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v4, s5, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX11-NEXT:    v_and_or_b32 v2, v5, v2, v6
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v3, v7
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
   %cast.result = bitcast i128 %result to <4 x float>
@@ -6557,47 +6485,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_sub_i32 s7, s4, 64
-; GFX6-NEXT:    s_sub_i32 s5, 64, s4
+; GFX6-NEXT:    s_sub_i32 s2, 64, s4
 ; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[8:9], s5
+; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[8:9], s2
 ; GFX6-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX6-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[8:9], s7
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX6-NEXT:    s_sub_i32 s0, s6, 64
-; GFX6-NEXT:    s_sub_i32 s1, 64, s6
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT:    s_lshl_b64 s[4:5], s[8:9], s7
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT:    s_sub_i32 s3, s6, 64
+; GFX6-NEXT:    s_sub_i32 s4, 64, s6
 ; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s6
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s1
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s4
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s6
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s7
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s3
+; GFX6-NEXT:    s_and_b32 s3, 1, s2
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s8
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX6-NEXT:    s_and_b32 s3, 1, s5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, s5, v3
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX6-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshr_i128_svs:
@@ -6610,47 +6535,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_sub_i32 s7, s4, 64
-; GFX8-NEXT:    s_sub_i32 s5, 64, s4
+; GFX8-NEXT:    s_sub_i32 s2, 64, s4
 ; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[8:9], s5
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[8:9], s2
 ; GFX8-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX8-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[8:9], s7
-; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX8-NEXT:    s_sub_i32 s0, s6, 64
-; GFX8-NEXT:    s_sub_i32 s1, 64, s6
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    s_lshl_b64 s[4:5], s[8:9], s7
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    s_sub_i32 s3, s6, 64
+; GFX8-NEXT:    s_sub_i32 s4, 64, s6
 ; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s6, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX8-NEXT:    s_and_b32 s0, 1, s7
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s3, v[2:3]
+; GFX8-NEXT:    s_and_b32 s3, 1, s2
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_and_b32 s0, 1, s8
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX8-NEXT:    s_and_b32 s3, 1, s5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, s4, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, s5, v3
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX8-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshr_i128_svs:
@@ -6663,47 +6585,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    s_mov_b32 s1, 0
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_sub_i32 s7, s4, 64
-; GFX9-NEXT:    s_sub_i32 s5, 64, s4
+; GFX9-NEXT:    s_sub_i32 s2, 64, s4
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[8:9], s5
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[8:9], s2
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX9-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[8:9], s7
-; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX9-NEXT:    s_sub_i32 s0, s6, 64
-; GFX9-NEXT:    s_sub_i32 s1, 64, s6
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[8:9], s7
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_sub_i32 s3, s6, 64
+; GFX9-NEXT:    s_sub_i32 s4, 64, s6
 ; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s6, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX9-NEXT:    s_and_b32 s0, 1, s7
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s3, v[2:3]
+; GFX9-NEXT:    s_and_b32 s3, 1, s2
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_and_b32 s0, 1, s8
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX9-NEXT:    s_and_b32 s3, 1, s5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, s4, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, s5, v3
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX9-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX9-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshr_i128_svs:
@@ -6719,44 +6638,39 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s5
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s4
-; GFX10-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s0, 64, s6
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX10-NEXT:    s_sub_i32 s0, s6, 64
+; GFX10-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s2, 64, s6
+; GFX10-NEXT:    s_sub_i32 s3, s6, 64
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s2, v[2:3]
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s3, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, 1, s2
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, 1, s1
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s0, 1, s7
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT:    s_and_b32 s3, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
-; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
-; GFX10-NEXT:    v_or_b32_e32 v0, s4, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, s5, v1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10-NEXT:    v_and_or_b32 v2, s2, v2, s0
+; GFX10-NEXT:    v_and_or_b32 v3, s3, v3, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_fshr_i128_svs:
@@ -6772,47 +6686,38 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX11-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[0:1], s5
-; GFX11-NEXT:    s_lshl_b64 s[10:11], s[2:3], s4
-; GFX11-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
-; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s0, 64, s6
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX11-NEXT:    s_sub_i32 s0, s6, 64
+; GFX11-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX11-NEXT:    s_sub_i32 s2, 64, s6
+; GFX11-NEXT:    s_sub_i32 s3, s6, 64
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s2, v[2:3]
 ; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], s3, v[2:3]
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    s_and_b32 s3, 1, s2
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX11-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX11-NEXT:    s_and_b32 s0, 1, s1
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s0, 1, s7
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX11-NEXT:    s_and_b32 s3, 1, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, s4, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, s5, v1
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX11-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX11-NEXT:    v_and_or_b32 v2, s2, v2, s0
+; GFX11-NEXT:    v_and_or_b32 v3, s3, v3, s1
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
   %cast.result = bitcast i128 %result to <4 x float>
@@ -6822,161 +6727,152 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
 ; GFX6-LABEL: v_fshr_i128_vss:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GFX6-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX6-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    s_sub_i32 s5, s4, 64
-; GFX6-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[0:1], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
-; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_sub_i32 s5, s4, 64
+; GFX6-NEXT:    s_sub_i32 s7, 64, s4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], s7
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s4
-; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], s4
-; GFX6-NEXT:    s_and_b32 s4, 1, s8
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX6-NEXT:    s_and_b32 s4, 1, s9
-; GFX6-NEXT:    s_sub_i32 s10, s6, 64
-; GFX6-NEXT:    s_sub_i32 s8, 64, s6
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX6-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX6-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[4:5], s5
+; GFX6-NEXT:    s_and_b32 s5, 1, s8
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT:    s_and_b32 s4, 1, s4
+; GFX6-NEXT:    s_sub_i32 s5, s6, 64
+; GFX6-NEXT:    s_sub_i32 s10, 64, s6
 ; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[4:5], s[2:3], s6
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[2:3], s6
 ; GFX6-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s5
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX6-NEXT:    v_or_b32_e32 v0, s0, v4
-; GFX6-NEXT:    v_or_b32_e32 v1, s1, v5
-; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[4:5], 0x10000
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX6-NEXT:    v_or_b32_e32 v3, s3, v1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshr_i128_vss:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX8-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX8-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    s_sub_i32 s5, s4, 64
-; GFX8-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], 1, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
-; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_sub_i32 s5, s4, 64
+; GFX8-NEXT:    s_sub_i32 s7, 64, s4
+; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s7, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX8-NEXT:    s_and_b32 s4, 1, s8
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT:    s_and_b32 s4, 1, s9
-; GFX8-NEXT:    s_sub_i32 s10, s6, 64
-; GFX8-NEXT:    s_sub_i32 s8, 64, s6
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX8-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX8-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[4:5]
+; GFX8-NEXT:    s_and_b32 s5, 1, s8
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
+; GFX8-NEXT:    s_sub_i32 s5, s6, 64
+; GFX8-NEXT:    s_sub_i32 s10, 64, s6
 ; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[4:5], s[2:3], s6
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[2:3], s6
 ; GFX8-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s5
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX8-NEXT:    v_or_b32_e32 v0, s0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, s1, v5
-; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[4:5], 0x10000
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
+; GFX8-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_or_b32_e32 v3, s3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshr_i128_vss:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX9-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX9-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    s_sub_i32 s5, s4, 64
-; GFX9-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
-; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_sub_i32 s5, s4, 64
+; GFX9-NEXT:    s_sub_i32 s7, 64, s4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s7, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX9-NEXT:    s_and_b32 s4, 1, s8
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT:    s_and_b32 s4, 1, s9
-; GFX9-NEXT:    s_sub_i32 s10, s6, 64
-; GFX9-NEXT:    s_sub_i32 s8, 64, s6
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX9-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[4:5]
+; GFX9-NEXT:    s_and_b32 s5, 1, s8
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT:    s_and_b32 s4, 1, s4
+; GFX9-NEXT:    s_sub_i32 s5, s6, 64
+; GFX9-NEXT:    s_sub_i32 s10, 64, s6
 ; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], s6
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[2:3], s6
 ; GFX9-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s5
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX9-NEXT:    v_or_b32_e32 v0, s0, v4
-; GFX9-NEXT:    v_or_b32_e32 v1, s1, v5
-; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[4:5], 0x10000
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX9-NEXT:    v_or_b32_e32 v3, s3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshr_i128_vss:
@@ -6984,50 +6880,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX10-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX10-NEXT:    s_sub_i32 s7, 64, s4
+; GFX10-NEXT:    s_andn2_b64 s[6:7], 0x7f, s[4:5]
+; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], 0x7f
+; GFX10-NEXT:    s_sub_i32 s5, 64, s6
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX10-NEXT:    s_sub_i32 s5, s4, 64
-; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s4, v[0:1]
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    s_and_b32 s4, 1, s8
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT:    s_and_b32 s4, 1, s9
-; GFX10-NEXT:    s_sub_i32 s10, s6, 64
-; GFX10-NEXT:    s_sub_i32 s7, 64, s6
+; GFX10-NEXT:    s_sub_i32 s7, s6, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s5, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, 1, s5
+; GFX10-NEXT:    s_sub_i32 s7, s4, 64
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX10-NEXT:    s_and_b32 s5, 1, s6
+; GFX10-NEXT:    s_sub_i32 s10, 64, s4
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[4:5], s[0:1], s6
-; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s7
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[2:3], s6
-; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s4
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
+; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
-; GFX10-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX10-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -7037,48 +6930,46 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX11-NEXT:    s_and_not1_b64 s[4:5], 0x7f, s[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_sub_i32 s7, 64, s4
+; GFX11-NEXT:    s_and_not1_b64 s[6:7], 0x7f, s[4:5]
+; GFX11-NEXT:    s_and_b64 s[4:5], s[4:5], 0x7f
+; GFX11-NEXT:    s_sub_i32 s5, 64, s6
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX11-NEXT:    s_sub_i32 s5, s4, 64
-; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
-; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    v_lshlrev_b64 v[8:9], s4, v[0:1]
-; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX11-NEXT:    s_and_b32 s4, 1, s8
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT:    s_and_b32 s4, 1, s9
-; GFX11-NEXT:    s_sub_i32 s10, s6, 64
-; GFX11-NEXT:    s_sub_i32 s7, 64, s6
+; GFX11-NEXT:    s_sub_i32 s7, s6, 64
 ; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9
-; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s5, v[0:1]
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-NEXT:    s_and_b32 s5, 1, s5
+; GFX11-NEXT:    s_sub_i32 s7, s4, 64
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX11-NEXT:    s_and_b32 s5, 1, s6
+; GFX11-NEXT:    s_sub_i32 s10, 64, s4
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[4:5], s[0:1], s6
-; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s7
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[2:3], s6
-; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_lshr_b64 s[8:9], s[0:1], s4
+; GFX11-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX11-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
+; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX11-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX11-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
-; GFX11-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX11-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -7223,40 +7114,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[22:23], s[0:1], 1
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[24:25]
 ; GFX6-NEXT:    s_sub_i32 s19, s16, 64
-; GFX6-NEXT:    s_sub_i32 s17, 64, s16
+; GFX6-NEXT:    s_sub_i32 s2, 64, s16
 ; GFX6-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX6-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[22:23], s16
-; GFX6-NEXT:    s_lshr_b64 s[26:27], s[22:23], s17
+; GFX6-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[22:23], s2
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX6-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX6-NEXT:    s_lshl_b64 s[22:23], s[22:23], s19
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[16:17]
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[22:23], s19
 ; GFX6-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[22:23]
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX6-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX6-NEXT:    s_sub_i32 s24, s18, 64
-; GFX6-NEXT:    s_sub_i32 s22, 64, s18
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX6-NEXT:    s_sub_i32 s17, s18, 64
+; GFX6-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX6-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX6-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX6-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[0:1], s[10:11], s18
-; GFX6-NEXT:    s_lshr_b64 s[18:19], s[8:9], s18
-; GFX6-NEXT:    s_lshl_b64 s[22:23], s[10:11], s22
-; GFX6-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s24
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX6-NEXT:    s_cselect_b64 s[10:11], s[18:19], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX6-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
-; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX6-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
+; GFX6-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[22:23], s[10:11], s18
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
+; GFX6-NEXT:    s_lshl_b64 s[18:19], s[10:11], s19
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], s[22:23]
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX6-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX6-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
@@ -7264,40 +7152,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
 ; GFX6-NEXT:    s_or_b64 s[4:5], s[6:7], s[24:25]
 ; GFX6-NEXT:    s_sub_i32 s9, s10, 64
-; GFX6-NEXT:    s_sub_i32 s11, 64, s10
+; GFX6-NEXT:    s_sub_i32 s6, 64, s10
 ; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX6-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[6:7], s[16:17], s10
-; GFX6-NEXT:    s_lshr_b64 s[18:19], s[16:17], s11
+; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[16:17], s6
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
-; GFX6-NEXT:    s_lshl_b64 s[16:17], s[16:17], s9
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX6-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[16:17]
-; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX6-NEXT:    s_cselect_b64 s[10:11], s[4:5], s[10:11]
-; GFX6-NEXT:    s_sub_i32 s18, s8, 64
-; GFX6-NEXT:    s_sub_i32 s16, 64, s8
+; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[16:17], s9
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[4:5], s[6:7]
+; GFX6-NEXT:    s_sub_i32 s11, s8, 64
+; GFX6-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
-; GFX6-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
-; GFX6-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
-; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
-; GFX6-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
-; GFX6-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
-; GFX6-NEXT:    s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[16:17], s[14:15], s8
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[14:15], s9
+; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[14:15], s11
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fshr_v2i128:
@@ -7310,40 +7195,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[22:23], s[0:1], 1
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[24:25]
 ; GFX8-NEXT:    s_sub_i32 s19, s16, 64
-; GFX8-NEXT:    s_sub_i32 s17, 64, s16
+; GFX8-NEXT:    s_sub_i32 s2, 64, s16
 ; GFX8-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[22:23], s16
-; GFX8-NEXT:    s_lshr_b64 s[26:27], s[22:23], s17
+; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[22:23], s2
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX8-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX8-NEXT:    s_lshl_b64 s[22:23], s[22:23], s19
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[16:17]
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[22:23], s19
 ; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[22:23]
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX8-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX8-NEXT:    s_sub_i32 s24, s18, 64
-; GFX8-NEXT:    s_sub_i32 s22, 64, s18
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX8-NEXT:    s_sub_i32 s17, s18, 64
+; GFX8-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX8-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[0:1], s[10:11], s18
-; GFX8-NEXT:    s_lshr_b64 s[18:19], s[8:9], s18
-; GFX8-NEXT:    s_lshl_b64 s[22:23], s[10:11], s22
-; GFX8-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s24
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX8-NEXT:    s_cselect_b64 s[10:11], s[18:19], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX8-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
-; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX8-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
+; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[22:23], s[10:11], s18
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
+; GFX8-NEXT:    s_lshl_b64 s[18:19], s[10:11], s19
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
+; GFX8-NEXT:    s_and_b64 s[8:9], s[8:9], s[22:23]
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX8-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
@@ -7351,40 +7233,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
 ; GFX8-NEXT:    s_or_b64 s[4:5], s[6:7], s[24:25]
 ; GFX8-NEXT:    s_sub_i32 s9, s10, 64
-; GFX8-NEXT:    s_sub_i32 s11, 64, s10
+; GFX8-NEXT:    s_sub_i32 s6, 64, s10
 ; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[6:7], s[16:17], s10
-; GFX8-NEXT:    s_lshr_b64 s[18:19], s[16:17], s11
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[16:17], s6
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
-; GFX8-NEXT:    s_lshl_b64 s[16:17], s[16:17], s9
-; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX8-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[16:17]
-; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX8-NEXT:    s_cselect_b64 s[10:11], s[4:5], s[10:11]
-; GFX8-NEXT:    s_sub_i32 s18, s8, 64
-; GFX8-NEXT:    s_sub_i32 s16, 64, s8
+; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[16:17], s9
+; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[4:5], s[6:7]
+; GFX8-NEXT:    s_sub_i32 s11, s8, 64
+; GFX8-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
-; GFX8-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
-; GFX8-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
-; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
-; GFX8-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
-; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX8-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
-; GFX8-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
-; GFX8-NEXT:    s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[16:17], s[14:15], s8
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[14:15], s9
+; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[14:15], s11
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX8-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fshr_v2i128:
@@ -7397,40 +7276,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[22:23], s[0:1], 1
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[24:25]
 ; GFX9-NEXT:    s_sub_i32 s19, s16, 64
-; GFX9-NEXT:    s_sub_i32 s17, 64, s16
+; GFX9-NEXT:    s_sub_i32 s2, 64, s16
 ; GFX9-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[22:23], s16
-; GFX9-NEXT:    s_lshr_b64 s[26:27], s[22:23], s17
+; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[22:23], s2
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX9-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX9-NEXT:    s_lshl_b64 s[22:23], s[22:23], s19
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[16:17]
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[22:23], s19
 ; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[22:23]
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX9-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX9-NEXT:    s_sub_i32 s24, s18, 64
-; GFX9-NEXT:    s_sub_i32 s22, 64, s18
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX9-NEXT:    s_sub_i32 s17, s18, 64
+; GFX9-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX9-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[0:1], s[10:11], s18
-; GFX9-NEXT:    s_lshr_b64 s[18:19], s[8:9], s18
-; GFX9-NEXT:    s_lshl_b64 s[22:23], s[10:11], s22
-; GFX9-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s24
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX9-NEXT:    s_cselect_b64 s[10:11], s[18:19], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX9-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
-; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX9-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[22:23], s[10:11], s18
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
+; GFX9-NEXT:    s_lshl_b64 s[18:19], s[10:11], s19
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], s[22:23]
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX9-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
@@ -7438,40 +7314,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[24:25]
 ; GFX9-NEXT:    s_sub_i32 s9, s10, 64
-; GFX9-NEXT:    s_sub_i32 s11, 64, s10
+; GFX9-NEXT:    s_sub_i32 s6, 64, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[16:17], s10
-; GFX9-NEXT:    s_lshr_b64 s[18:19], s[16:17], s11
+; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[16:17], s6
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
-; GFX9-NEXT:    s_lshl_b64 s[16:17], s[16:17], s9
-; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX9-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[16:17]
-; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX9-NEXT:    s_cselect_b64 s[10:11], s[4:5], s[10:11]
-; GFX9-NEXT:    s_sub_i32 s18, s8, 64
-; GFX9-NEXT:    s_sub_i32 s16, 64, s8
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[16:17], s9
+; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-NEXT:    s_sub_i32 s11, s8, 64
+; GFX9-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
-; GFX9-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
-; GFX9-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
-; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
-; GFX9-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
-; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX9-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
-; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
-; GFX9-NEXT:    s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[16:17], s[14:15], s8
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[14:15], s9
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[14:15], s11
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fshr_v2i128:
@@ -7488,76 +7361,70 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s17
-; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], s16
-; GFX10-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX10-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[2:3], s16
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
+; GFX10-NEXT:    s_or_b64 s[16:17], s[24:25], s[16:17]
 ; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX10-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s22, s18, 64
+; GFX10-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX10-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX10-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
 ; GFX10-NEXT:    s_lshl_b64 s[24:25], s[10:11], s19
 ; GFX10-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
-; GFX10-NEXT:    s_lshr_b64 s[10:11], s[10:11], s22
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[18:19], 0
+; GFX10-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
 ; GFX10-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX10-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX10-NEXT:    s_lshr_b32 s22, s5, 31
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX10-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], 1
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[22:23]
 ; GFX10-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[4:5], s11
-; GFX10-NEXT:    s_lshl_b64 s[18:19], s[6:7], s10
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], s9
-; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX10-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
-; GFX10-NEXT:    s_sub_i32 s18, s8, 64
+; GFX10-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
 ; GFX10-NEXT:    s_lshl_b64 s[16:17], s[14:15], s9
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[14:15], s8
 ; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[16:17]
-; GFX10-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_lshr_b64 s[14:15], s[14:15], s11
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[14:15]
-; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX10-NEXT:    s_bfe_u64 s[10:11], s[10:11], 0x10000
+; GFX10-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -7575,76 +7442,71 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[24:25], s[0:1], s17
-; GFX11-NEXT:    s_lshl_b64 s[26:27], s[2:3], s16
-; GFX11-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX11-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX11-NEXT:    s_lshl_b64 s[16:17], s[2:3], s16
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
+; GFX11-NEXT:    s_or_b64 s[16:17], s[24:25], s[16:17]
 ; GFX11-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX11-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s22, s18, 64
+; GFX11-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX11-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX11-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX11-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
 ; GFX11-NEXT:    s_lshl_b64 s[24:25], s[10:11], s19
 ; GFX11-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
-; GFX11-NEXT:    s_lshr_b64 s[10:11], s[10:11], s22
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX11-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s22, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[18:19], 0
+; GFX11-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
 ; GFX11-NEXT:    s_and_not1_b64 s[10:11], 0x7f, s[20:21]
+; GFX11-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX11-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX11-NEXT:    s_lshr_b32 s22, s5, 31
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX11-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], 1
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[22:23]
 ; GFX11-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[16:17], s[4:5], s11
-; GFX11-NEXT:    s_lshl_b64 s[18:19], s[6:7], s10
-; GFX11-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX11-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], s9
-; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX11-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX11-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX11-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
-; GFX11-NEXT:    s_sub_i32 s18, s8, 64
+; GFX11-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
 ; GFX11-NEXT:    s_lshl_b64 s[16:17], s[14:15], s9
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[14:15], s8
 ; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[16:17]
-; GFX11-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX11-NEXT:    s_lshr_b64 s[14:15], s[14:15], s11
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[14:15]
-; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX11-NEXT:    s_bfe_u64 s[10:11], s[10:11], 0x10000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
@@ -7655,274 +7517,238 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-LABEL: v_fshr_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v16
+; GFX6-NEXT:    v_and_b32_e32 v21, 0x7f, v16
 ; GFX6-NEXT:    v_not_b32_e32 v16, v16
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    v_and_b32_e32 v24, 0x7f, v16
+; GFX6-NEXT:    v_and_b32_e32 v22, 0x7f, v16
 ; GFX6-NEXT:    v_lshl_b64 v[16:17], v[0:1], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v24
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v22
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[16:17], v0
-; GFX6-NEXT:    v_lshl_b64 v[18:19], v[2:3], v24
-; GFX6-NEXT:    v_subrev_i32_e32 v25, vcc, 64, v24
-; GFX6-NEXT:    v_lshl_b64 v[21:22], v[16:17], v24
+; GFX6-NEXT:    v_lshl_b64 v[18:19], v[2:3], v22
+; GFX6-NEXT:    v_subrev_i32_e32 v23, vcc, 64, v22
 ; GFX6-NEXT:    v_or_b32_e32 v18, v0, v18
 ; GFX6-NEXT:    v_or_b32_e32 v19, v1, v19
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[16:17], v25
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX6-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[16:17], v23
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v22
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v19, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v23
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], v23
-; GFX6-NEXT:    v_lshl_b64 v[2:3], v[10:11], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v24, vcc, 64, v23
-; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v24
-; GFX6-NEXT:    v_lshr_b64 v[16:17], v[10:11], v23
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v22
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v21
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], v21
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[10:11], v16
+; GFX6-NEXT:    v_subrev_i32_e32 v22, vcc, 64, v21
+; GFX6-NEXT:    v_or_b32_e32 v16, v0, v16
+; GFX6-NEXT:    v_or_b32_e32 v17, v1, v17
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v22
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v21
+; GFX6-NEXT:    v_lshr_b64 v[18:19], v[10:11], v21
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v21
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v8, v8, v18
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX6-NEXT:    v_not_b32_e32 v8, v20
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GFX6-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX6-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX6-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
 ; GFX6-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v19
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v17
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[8:9], v4
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v19
-; GFX6-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX6-NEXT:    v_and_b32_e32 v18, 0x7f, v20
-; GFX6-NEXT:    v_subrev_i32_e32 v20, vcc, 64, v19
-; GFX6-NEXT:    v_lshl_b64 v[16:17], v[8:9], v19
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v17
+; GFX6-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v17
 ; GFX6-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX6-NEXT:    v_or_b32_e32 v11, v5, v11
-; GFX6-NEXT:    v_lshl_b64 v[4:5], v[8:9], v20
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX6-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX6-NEXT:    v_lshl_b64 v[4:5], v[8:9], v18
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX6-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v4, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v18
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], v18
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[14:15], v6
-; GFX6-NEXT:    v_subrev_i32_e32 v19, vcc, 64, v18
-; GFX6-NEXT:    v_or_b32_e32 v6, v4, v6
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v7
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], v19
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[14:15], v18
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v16
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], v16
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[14:15], v8
+; GFX6-NEXT:    v_subrev_i32_e32 v17, vcc, 64, v16
+; GFX6-NEXT:    v_or_b32_e32 v8, v4, v8
+; GFX6-NEXT:    v_or_b32_e32 v9, v5, v9
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], v17
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[14:15], v16
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX6-NEXT:    v_and_b32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v21, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v22, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX6-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX6-NEXT:    v_or_b32_e32 v6, v10, v6
-; GFX6-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v16
+; GFX8-NEXT:    v_and_b32_e32 v21, 0x7f, v16
 ; GFX8-NEXT:    v_not_b32_e32 v16, v16
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v24, 0x7f, v16
+; GFX8-NEXT:    v_and_b32_e32 v22, 0x7f, v16
 ; GFX8-NEXT:    v_lshlrev_b64 v[16:17], 1, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v24
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v22
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, v[16:17]
-; GFX8-NEXT:    v_lshlrev_b64 v[18:19], v24, v[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v25, vcc, 64, v24
-; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v24, v[16:17]
+; GFX8-NEXT:    v_lshlrev_b64 v[18:19], v22, v[2:3]
+; GFX8-NEXT:    v_subrev_u32_e32 v23, vcc, 64, v22
 ; GFX8-NEXT:    v_or_b32_e32 v18, v0, v18
 ; GFX8-NEXT:    v_or_b32_e32 v19, v1, v19
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v25, v[16:17]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v23, v[16:17]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v23
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v23, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX8-NEXT:    v_subrev_u32_e32 v24, vcc, 64, v23
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX8-NEXT:    v_lshrrev_b64 v[16:17], v23, v[10:11]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v21
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v21, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[10:11]
+; GFX8-NEXT:    v_subrev_u32_e32 v22, vcc, 64, v21
+; GFX8-NEXT:    v_or_b32_e32 v16, v0, v16
+; GFX8-NEXT:    v_or_b32_e32 v17, v1, v17
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v22, v[10:11]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v21
+; GFX8-NEXT:    v_lshrrev_b64 v[18:19], v21, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v21
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v8, v8, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX8-NEXT:    v_not_b32_e32 v8, v20
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GFX8-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX8-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX8-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v19
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v17
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v19, v[6:7]
-; GFX8-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX8-NEXT:    v_and_b32_e32 v18, 0x7f, v20
-; GFX8-NEXT:    v_subrev_u32_e32 v20, vcc, 64, v19
-; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v19, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v17, v[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v18, vcc, 64, v17
 ; GFX8-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX8-NEXT:    v_or_b32_e32 v11, v5, v11
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v20, v[8:9]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v18, v[8:9]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX8-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v18
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v18, v[12:13]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX8-NEXT:    v_subrev_u32_e32 v19, vcc, 64, v18
-; GFX8-NEXT:    v_or_b32_e32 v6, v4, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, v5, v7
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v19, v[14:15]
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v18, v[14:15]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v16
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v16, v[12:13]
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v8, v[14:15]
+; GFX8-NEXT:    v_subrev_u32_e32 v17, vcc, 64, v16
+; GFX8-NEXT:    v_or_b32_e32 v8, v4, v8
+; GFX8-NEXT:    v_or_b32_e32 v9, v5, v9
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v17, v[14:15]
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v16, v[14:15]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX8-NEXT:    v_and_b32_e32 v8, v8, v10
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v21, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v22, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX8-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX8-NEXT:    v_or_b32_e32 v6, v10, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v16
+; GFX9-NEXT:    v_and_b32_e32 v21, 0x7f, v16
 ; GFX9-NEXT:    v_not_b32_e32 v16, v16
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v24, 0x7f, v16
+; GFX9-NEXT:    v_and_b32_e32 v22, 0x7f, v16
 ; GFX9-NEXT:    v_lshlrev_b64 v[16:17], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v24
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v22
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, v[16:17]
-; GFX9-NEXT:    v_lshlrev_b64 v[18:19], v24, v[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v25, 64, v24
-; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v24, v[16:17]
+; GFX9-NEXT:    v_lshlrev_b64 v[18:19], v22, v[2:3]
+; GFX9-NEXT:    v_subrev_u32_e32 v23, 64, v22
 ; GFX9-NEXT:    v_or_b32_e32 v18, v0, v18
 ; GFX9-NEXT:    v_or_b32_e32 v19, v1, v19
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v25, v[16:17]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v23, v[16:17]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v22
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v19, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v22
 ; GFX9-NEXT:    v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v23
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v21
 ; GFX9-NEXT:    v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v23, v[8:9]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v21, v[8:9]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX9-NEXT:    v_subrev_u32_e32 v24, 64, v23
+; GFX9-NEXT:    v_subrev_u32_e32 v22, 64, v21
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v23, v[10:11]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v22, v[10:11]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v21
+; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v21, v[10:11]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v21
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
 ; GFX9-NEXT:    v_not_b32_e32 v8, v20
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GFX9-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX9-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v17, v19
+; GFX9-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v17
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v19, v[6:7]
-; GFX9-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX9-NEXT:    v_and_b32_e32 v18, 0x7f, v20
-; GFX9-NEXT:    v_subrev_u32_e32 v20, 64, v19
-; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v19, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v17, v[6:7]
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v16, v18
+; GFX9-NEXT:    v_subrev_u32_e32 v18, 64, v17
 ; GFX9-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX9-NEXT:    v_or_b32_e32 v11, v5, v11
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v20, v[8:9]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v18, v[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX9-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v4, v6, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v18
+; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v18, v[12:13]
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v16, v[12:13]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX9-NEXT:    v_subrev_u32_e32 v19, 64, v18
+; GFX9-NEXT:    v_subrev_u32_e32 v17, 64, v16
 ; GFX9-NEXT:    v_or_b32_e32 v6, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v5, v7
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v19, v[14:15]
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v18, v[14:15]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v17, v[14:15]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v16, v[14:15]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v21, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v22, v1
-; GFX9-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX9-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX9-NEXT:    v_or_b32_e32 v6, v10, v6
-; GFX9-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX9-NEXT:    v_and_or_b32 v6, v6, v8, v10
+; GFX9-NEXT:    v_and_or_b32 v7, 0, v9, v11
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i128:
@@ -7930,90 +7756,78 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v17, v16
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    v_and_b32_e32 v26, 0x7f, v16
+; GFX10-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT:    v_and_b32_e32 v25, 0x7f, v17
+; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v17
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v27, 64, v26
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v26
-; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v25
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v24
+; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v19
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v17
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v19, 64, v25
-; GFX10-NEXT:    v_lshlrev_b64 v[23:24], v25, v[0:1]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v23, 64, v19
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
 ; GFX10-NEXT:    v_lshrrev_b64 v[17:18], v18, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v25, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v19, v[0:1]
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, 0, v23, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, 0, v24, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v26
-; GFX10-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v26, v[8:9]
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v1, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v18, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v0, v21, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v27, v[10:11]
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_or_b32_e32 v16, v16, v18
-; GFX10-NEXT:    v_or_b32_e32 v17, v17, v19
+; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v19, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v16, v17, v21
+; GFX10-NEXT:    v_or_b32_e32 v17, v18, v22
+; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v0, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v1, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v24, v[8:9]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v18, v[10:11]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v19, 64, v24
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v21, v2, vcc_lo
+; GFX10-NEXT:    v_not_b32_e32 v21, v20
 ; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v26
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s4
-; GFX10-NEXT:    v_not_b32_e32 v16, v20
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s4
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v26, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v25, 0x7f, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v24, v[10:11]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v19, v[10:11]
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 31, v5
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v9, 64, v25
-; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX10-NEXT:    v_and_b32_e32 v23, 0x7f, v20
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, v3, s4
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v9, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v25, v[6:7]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v23
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v25
-; GFX10-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v25, v[4:5]
-; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v23, v[12:13]
-; GFX10-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v23
-; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
-; GFX10-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX10-NEXT:    v_or_b32_e32 v5, v9, v11
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v8, v[14:15]
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v16, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v16, v18, v20
-; GFX10-NEXT:    v_or_b32_e32 v18, v19, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v3, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[3:4], v23, v[14:15]
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v16, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v23
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v25
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v18, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v1, v24, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v12, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v9, v13, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s4
-; GFX10-NEXT:    v_or_b32_e32 v3, v22, v26
-; GFX10-NEXT:    v_or_b32_e32 v4, v11, v5
-; GFX10-NEXT:    v_or_b32_e32 v5, v14, v8
-; GFX10-NEXT:    v_or_b32_e32 v6, v6, v9
-; GFX10-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX10-NEXT:    v_sub_nc_u32_e32 v17, 64, v19
+; GFX10-NEXT:    v_or_b32_e32 v6, v6, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v20, 0x7f, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v17, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v19, v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX10-NEXT:    v_sub_nc_u32_e32 v23, 64, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
+; GFX10-NEXT:    v_or_b32_e32 v24, v10, v16
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_or_b32_e32 v25, v11, v17
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v20, v[12:13]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v23, v[14:15]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s4
+; GFX10-NEXT:    v_and_or_b32 v2, v21, v2, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v4, v24, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v5, v25, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[14:15]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v19
+; GFX10-NEXT:    v_or_b32_e32 v10, v10, v16
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v20
+; GFX10-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v20, v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v18, v7, s4
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v3, v22
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s5
+; GFX10-NEXT:    v_and_or_b32 v6, v10, v8, v6
+; GFX10-NEXT:    v_and_or_b32 v7, 0, v9, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i128:
@@ -8021,100 +7835,80 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v17, v16
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX11-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v25, 0x7f, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v17
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v17
-; GFX11-NEXT:    v_lshlrev_b64 v[23:24], v25, v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v23, 0, v23 :: v_dual_and_b32 v26, 0x7f, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, 0, v24, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v25
-; GFX11-NEXT:    v_lshlrev_b64 v[21:22], v25, v[2:3]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v19, 64, v25
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v27, 64, v26
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v26
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v23, 64, v19
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
 ; GFX11-NEXT:    v_lshrrev_b64 v[17:18], v18, v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v19, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v26
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v26, v[8:9]
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v1, v22, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[18:19], v18, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v0, v21, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v27, v[10:11]
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v19
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s0
-; GFX11-NEXT:    v_not_b32_e32 v16, v20
+; GFX11-NEXT:    v_lshlrev_b64 v[21:22], v19, v[2:3]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v16, v17, v21
+; GFX11-NEXT:    v_or_b32_e32 v17, v18, v22
+; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v21, v0, v16 :: v_dual_cndmask_b32 v22, v1, v17
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v24, v[8:9]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v18, v[10:11]
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v19, 64, v24
 ; GFX11-NEXT:    v_cndmask_b32_e32 v18, v21, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s0
+; GFX11-NEXT:    v_not_b32_e32 v21, v20
 ; GFX11-NEXT:    v_cndmask_b32_e32 v22, v22, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT:    v_and_b32_e32 v25, 0x7f, v16
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v26, v[10:11]
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v9 :: v_dual_cndmask_b32 v0, v0, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v24, v[10:11]
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v19, v[10:11]
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v21
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 31, v5
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
-; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 64, v25
-; GFX11-NEXT:    v_cndmask_b32_e64 v26, 0, v3, s0
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 64, v25
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX11-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v9, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v25, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[10:11], v25, v[6:7]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v25
-; GFX11-NEXT:    v_or_b32_e32 v1, v24, v1
-; GFX11-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX11-NEXT:    v_and_b32_e32 v23, 0x7f, v20
-; GFX11-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v11, 0, v16 :: v_dual_cndmask_b32 v10, v3, v10
-; GFX11-NEXT:    v_sub_nc_u32_e32 v20, 64, v23
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v23
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], v23, v[12:13]
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v8, v[14:15]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], v23, v[14:15]
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v23
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s2
-; GFX11-NEXT:    v_or_b32_e32 v16, v18, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v21
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v16, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v18, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v10
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v8, v12, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v9, v13, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s0
-; GFX11-NEXT:    v_or_b32_e32 v3, v22, v26
-; GFX11-NEXT:    v_or_b32_e32 v4, v11, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v5, v14, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-NEXT:    v_sub_nc_u32_e32 v17, 64, v19
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v16
+; GFX11-NEXT:    v_and_b32_e32 v20, 0x7f, v20
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v10, v0 :: v_dual_cndmask_b32 v1, v11, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v24
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v17, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v19, v[6:7]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v23, 64, v20
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s0
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
+; GFX11-NEXT:    v_or_b32_e32 v24, v10, v16
+; GFX11-NEXT:    v_or_b32_e32 v25, v11, v17
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v20, v[12:13]
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v23, v[14:15]
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v20
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
+; GFX11-NEXT:    v_and_or_b32 v2, v21, v2, v18
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v19
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v4, v24 :: v_dual_cndmask_b32 v18, v5, v25
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v8, v[14:15]
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v16
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v20
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v20, v[14:15]
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v18, v7, s0
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v3, v22
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s1
+; GFX11-NEXT:    v_and_or_b32 v6, v10, v8, v6
+; GFX11-NEXT:    v_and_or_b32 v7, 0, v9, v7
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index b50af195e40ef..15038b2da9e73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -674,10 +674,12 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
 ; SI-NEXT:  .LBB6_3: ; %.continue0
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
-; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; SI-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    v_and_b32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
-; SI-NEXT:    s_nop 1
+; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -722,10 +724,12 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:  .LBB6_3: ; %.continue0
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, 1.0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -770,8 +774,10 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
 ; GFX10-32-NEXT:  .LBB6_3: ; %.continue0
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_xor_b32 s2, s0, -1
 ; GFX10-32-NEXT:    s_mov_b32 s1, s0
-; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-32-NEXT:    v_and_b32_e32 v0, 1.0, v0
 ; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -816,8 +822,10 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
 ; GFX10-64-NEXT:  .LBB6_3: ; %.continue0
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
-; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX10-64-NEXT:    v_and_b32_e32 v0, 1.0, v0
 ; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -911,10 +919,12 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    s_cbranch_execz .LBB7_8
 ; SI-NEXT:  .LBB7_5: ; %.continue0
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; SI-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[6:7]
+; SI-NEXT:    v_and_b32_e32 v2, v2, v0
 ; SI-NEXT:    v_mov_b32_e32 v3, v2
-; SI-NEXT:    s_nop 1
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -977,10 +987,12 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX9-NEXT:  .LBB7_5: ; %.continue0
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[6:7]
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1042,8 +1054,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-32-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX10-32-NEXT:  .LBB7_5: ; %.continue0
 ; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-32-NEXT:    s_xor_b32 s3, s0, -1
 ; GFX10-32-NEXT:    s_mov_b32 s2, s0
-; GFX10-32-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s2
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s3
+; GFX10-32-NEXT:    v_and_b32_e32 v2, v2, v0
 ; GFX10-32-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-32-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-32-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1106,8 +1120,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX10-64-NEXT:  .LBB7_5: ; %.continue0
 ; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-64-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
 ; GFX10-64-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX10-64-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[6:7]
+; GFX10-64-NEXT:    v_and_b32_e32 v2, v2, v0
 ; GFX10-64-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-64-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-64-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 980ba3da4bac7..508f226e2872e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1583,14 +1583,15 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_lshr_b64 v[10:11], v[4:5], v3
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v2
 ; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX6-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GFX6-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v10
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i65:
@@ -1605,14 +1606,15 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v3, v[4:5]
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v2, v[4:5]
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX8-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GFX8-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v10
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i65:
@@ -1627,14 +1629,15 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_lshrrev_b64 v[10:11], v3, v[4:5]
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v2, v[4:5]
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v10
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_lshr_i65:
@@ -1650,13 +1653,14 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v2, v[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v2, v6, v8
 ; GFX10-NEXT:    v_or_b32_e32 v6, v7, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v2, v3, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_lshr_i65:
@@ -1671,13 +1675,14 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v2, v[4:5]
 ; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_or_b32_e32 v2, v6, v8
 ; GFX11-NEXT:    v_or_b32_e32 v6, v7, v9
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v6, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
+; GFX11-NEXT:    v_and_b32_e32 v2, v3, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i65 %value, %amount
   ret i65 %result
@@ -1750,45 +1755,45 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-LABEL: s_lshr_i65:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b64 s[4:5], s[2:3], 1
-; GCN-NEXT:    s_sub_i32 s10, s3, 64
-; GCN-NEXT:    s_sub_i32 s8, 64, s3
+; GCN-NEXT:    s_sub_i32 s12, s3, 64
+; GCN-NEXT:    s_sub_i32 s10, 64, s3
 ; GCN-NEXT:    s_cmp_lt_u32 s3, 64
-; GCN-NEXT:    s_cselect_b32 s11, 1, 0
+; GCN-NEXT:    s_cselect_b32 s2, 1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-NEXT:    s_cselect_b32 s12, 1, 0
+; GCN-NEXT:    s_cselect_b32 s13, 1, 0
+; GCN-NEXT:    s_lshr_b64 s[8:9], s[0:1], s3
+; GCN-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[4:5], s3
-; GCN-NEXT:    s_lshr_b64 s[2:3], s[0:1], s3
-; GCN-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[4:5], s10
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GCN-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
+; GCN-NEXT:    s_cmp_lg_u32 s2, 0
+; GCN-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GCN-NEXT:    s_cmp_lg_u32 s13, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i65:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b64 s[4:5], s[2:3], 1
-; GFX10PLUS-NEXT:    s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT:    s_sub_i32 s2, 64, s3
+; GFX10PLUS-NEXT:    s_sub_i32 s12, s3, 64
+; GFX10PLUS-NEXT:    s_sub_i32 s8, 64, s3
 ; GFX10PLUS-NEXT:    s_cmp_lt_u32 s3, 64
-; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10PLUS-NEXT:    s_cmp_eq_u32 s3, 0
-; GFX10PLUS-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s13, 1, 0
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[6:7], s[0:1], s3
-; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s2
-; GFX10PLUS-NEXT:    s_lshr_b64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
+; GFX10PLUS-NEXT:    s_lshr_b64 s[10:11], s[4:5], s3
 ; GFX10PLUS-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT:    s_lshr_b64 s[4:5], s[4:5], s10
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX10PLUS-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[2:3], s[10:11]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i65 %value, %amount
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index d36f5c0ea89d9..bb06252902fd3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -5126,9 +5126,10 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[6:7], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5168,14 +5169,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT:    s_and_b32 s0, 1, s2
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5215,14 +5215,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_and_b32 s0, 1, s2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    s_and_b32 s0, 1, s2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5259,16 +5258,15 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[6:7], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s10
-; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s1, 1, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
+; GFX10-NEXT:    v_and_b32_e32 v1, s1, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    s_add_u32 s1, s0, 0x80000000
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
@@ -5300,16 +5298,14 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[6:7], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s10
-; GFX11-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX11-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s1, 1, s1
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v2, s5
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    s_ashr_i32 s0, s9, 31
+; GFX11-NEXT:    s_and_b32 s1, s1, 1
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, s1, v2
+; GFX11-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX11-NEXT:    s_add_u32 s1, s0, 0x80000000
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
@@ -5346,9 +5342,10 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX6-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v7
 ; GFX6-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GFX6-NEXT:    v_bfrev_b32_e32 v6, 1
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v3, v6
@@ -5377,9 +5374,10 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v7
 ; GFX8-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GFX8-NEXT:    v_bfrev_b32_e32 v6, 1
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v3, v6
@@ -5408,9 +5406,10 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v7
 ; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_bfrev_b32_e32 v6, 1
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v3, v6
@@ -5434,14 +5433,15 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX10-NEXT:    v_add_co_u32 v6, s0, 0x80000000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_add_co_u32 v6, s0, 0x80000000, v3
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -5461,14 +5461,14 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    v_dual_cndmask_b32 v3, v7, v6 :: v_dual_and_b32 v2, v2, v8
+; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
-; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-NEXT:    v_add_co_u32 v6, null, 0x80000000, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_add_co_u32 v6, null, 0x80000000, v3
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5495,12 +5495,13 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[2:3], 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5521,7 +5522,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v3, v7, vcc
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -5530,9 +5531,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT:    s_and_b32 s0, 1, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX8-NEXT:    s_and_b32 s0, s4, 1
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
@@ -5555,7 +5555,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v6, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -5564,9 +5564,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    s_and_b32 s0, 1, s4
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s4, 1
+; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
@@ -5586,19 +5585,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    s_and_b32 s0, s0, 1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
+; GFX10-NEXT:    v_and_b32_e32 v1, s0, v8
 ; GFX10-NEXT:    v_add_co_u32 v3, s0, 0x80000000, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5616,20 +5614,19 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], 0
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
+; GFX11-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX11-NEXT:    s_and_b32 s0, 1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
+; GFX11-NEXT:    v_and_b32_e32 v1, s0, v8
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5655,13 +5652,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
-; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5679,13 +5677,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX6-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
@@ -5707,13 +5706,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5731,13 +5731,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x80000000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
@@ -5759,13 +5760,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5783,13 +5785,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v5, v6, v5
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 0x80000000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
@@ -5810,44 +5813,46 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v4, v12
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v4, v12
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[4:5]
+; GFX10-NEXT:    v_and_b32_e32 v1, v19, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[12:13], v[6:7]
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
+; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v17
-; GFX10-NEXT:    v_add_co_u32 v7, s5, 0x80000000, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, s4, 0x80000000, v3
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_add_co_u32 v7, s5, 0x80000000, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX10-NEXT:    v_add_co_u32 v4, s4, 0x80000000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v16, v3, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v17, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v19, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v10, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_v2i128:
@@ -5861,43 +5866,42 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, v4, v12
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[10:11]
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v4, v12
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, v19, v18
+; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[4:5]
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
+; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[12:13], v[6:7]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[14:15]
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7]
+; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v13
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v2 :: v_dual_and_b32 v2, v5, v4
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v17
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_add_co_u32 v7, null, 0x80000000, v6
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_add_co_u32 v4, null, 0x80000000, v3
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1
+; GFX11-NEXT:    v_add_co_u32 v4, null, 0x80000000, v3
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v3 :: v_dual_and_b32 v5, 1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v17, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v19, v7, s0
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v3 :: v_dual_cndmask_b32 v3, v17, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v10, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v11, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
@@ -5921,9 +5925,10 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[10:11], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5954,9 +5959,10 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[14:15], 0
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[14:15], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6000,14 +6006,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT:    s_and_b32 s0, 1, s2
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6039,14 +6044,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_and_b32 s4, 1, s6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX8-NEXT:    s_and_b32 s4, 1, s6
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX8-NEXT:    s_and_b32 s4, s6, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6090,14 +6094,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_and_b32 s0, 1, s2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    s_and_b32 s0, 1, s2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6129,14 +6132,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_and_b32 s4, 1, s6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX9-NEXT:    s_and_b32 s4, 1, s6
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX9-NEXT:    s_and_b32 s4, s6, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6177,17 +6179,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s18
-; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s1, 1, s1
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    s_ashr_i32 s10, s17, 31
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
 ; GFX10-NEXT:    s_add_u32 s11, s10, 0x80000000
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX10-NEXT:    s_add_u32 s0, s4, s12
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v1, s1, v2
 ; GFX10-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX10-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
@@ -6204,27 +6205,26 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX10-NEXT:    s_and_b32 s4, 1, s12
-; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX10-NEXT:    s_and_b32 s5, 1, s5
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v2, s5, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s16
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, 0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s8
-; GFX10-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s10, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s10, vcc_lo
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s10, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v4
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
@@ -6254,17 +6254,15 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s18
-; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s1, 1, s1
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    s_ashr_i32 s10, s17, 31
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX11-NEXT:    s_and_b32 s1, s1, 1
 ; GFX11-NEXT:    s_add_u32 s11, s10, 0x80000000
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX11-NEXT:    s_add_u32 s0, s4, s12
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, s1, v2
 ; GFX11-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX11-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
@@ -6274,23 +6272,23 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s6, s[14:15], 0
-; GFX11-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_and_b32 v0, 1, v0
+; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX11-NEXT:    s_and_b32 s4, 1, s12
-; GFX11-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX11-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX11-NEXT:    s_and_b32 s5, 1, s5
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v3, 0, s4
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_and_b32 s5, s5, 1
+; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v2, v1 :: v_dual_and_b32 v2, s5, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s16
-; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s17
@@ -6299,7 +6297,6 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
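
The saddsat diffs above all show the same shape: a zeroing select on a boolean (v_cndmask_b32 ..., x, 0, cc) becomes an inverted compare plus a mask (v_cmp_ne_* feeding v_and_b32), and on GFX11 the AND can additionally co-issue as v_dual_cndmask_b32 :: v_dual_and_b32. For i1 values this is the plain Boolean identity sketched below; a minimal sketch with an illustrative function name, not code from this patch.

```llvm
; Minimal sketch: select %c, i1 0, i1 %x is the same as (!%c) & %x,
; which is why the compares flip polarity (eq -> ne) and the
; conditional move becomes an AND in the diffs above.
define i1 @select_cond_zero_sketch(i1 %c, i1 %x) {
  %not.c = xor i1 %c, true   ; invert the condition
  %r = and i1 %not.c, %x     ; mask instead of a conditional move
  ret i1 %r
}
```
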
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index c2f911cc44587..d0da2f485a113 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1585,14 +1585,15 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], v[2:3], v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v3
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[0:1], v3
-; GFX6-NEXT:    v_or_b32_e32 v9, v4, v5
+; GFX6-NEXT:    v_or_b32_e32 v7, v4, v5
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[0:1], v8
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, v0, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i65:
@@ -1603,14 +1604,15 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v3, v[2:3]
 ; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v3
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX8-NEXT:    v_or_b32_e32 v9, v4, v5
+; GFX8-NEXT:    v_or_b32_e32 v7, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i65:
@@ -1621,14 +1623,15 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], v3, v[2:3]
 ; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v3
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX9-NEXT:    v_or_b32_e32 v9, v4, v5
+; GFX9-NEXT:    v_or_b32_e32 v7, v4, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, v0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i65:
@@ -1636,15 +1639,16 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 64, v3
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 64, v3
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX10-NEXT:    v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX10-NEXT:    v_or_b32_e32 v1, v5, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v7, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1654,14 +1658,15 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 64, v3
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v3
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 64, v3
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX11-NEXT:    v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v1, v5, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v8, v1 :: v_dual_cndmask_b32 v1, 0, v7
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v7, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -1720,41 +1725,43 @@ define i65 @v_shl_i65_33(i65 %value) {
 define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-LABEL: s_shl_i65:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_sub_i32 s10, s3, 64
-; GCN-NEXT:    s_sub_i32 s6, 64, s3
+; GCN-NEXT:    s_sub_i32 s5, s3, 64
+; GCN-NEXT:    s_sub_i32 s8, 64, s3
 ; GCN-NEXT:    s_cmp_lt_u32 s3, 64
-; GCN-NEXT:    s_cselect_b32 s11, 1, 0
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s3, 0
 ; GCN-NEXT:    s_cselect_b32 s12, 1, 0
-; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GCN-NEXT:    s_lshl_b64 s[8:9], s[2:3], s3
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[0:1], s3
-; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GCN-NEXT:    s_lshl_b64 s[8:9], s[0:1], s10
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
-; GCN-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[8:9]
+; GCN-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
+; GCN-NEXT:    s_lshl_b64 s[10:11], s[2:3], s3
+; GCN-NEXT:    s_lshl_b64 s[6:7], s[0:1], s3
+; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GCN-NEXT:    s_lshl_b64 s[10:11], s[0:1], s5
+; GCN-NEXT:    s_bfe_u64 s[0:1], s[4:5], 0x10000
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[10:11]
 ; GCN-NEXT:    s_cmp_lg_u32 s12, 0
 ; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i65:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT:    s_sub_i32 s4, 64, s3
+; GFX10PLUS-NEXT:    s_sub_i32 s5, s3, 64
+; GFX10PLUS-NEXT:    s_sub_i32 s6, 64, s3
 ; GFX10PLUS-NEXT:    s_cmp_lt_u32 s3, 64
-; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10PLUS-NEXT:    s_cmp_eq_u32 s3, 0
-; GFX10PLUS-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
-; GFX10PLUS-NEXT:    s_lshl_b64 s[6:7], s[2:3], s3
-; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[0:1], s3
-; GFX10PLUS-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX10PLUS-NEXT:    s_lshl_b64 s[6:7], s[0:1], s10
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[8:9], 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10PLUS-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
+; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[2:3], s3
+; GFX10PLUS-NEXT:    s_lshl_b64 s[10:11], s[0:1], s3
+; GFX10PLUS-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10PLUS-NEXT:    s_bfe_u64 s[8:9], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_lshl_b64 s[12:13], s[0:1], s5
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[8:9], s[10:11]
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[12:13]
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i65 %value, %amount
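
The s_shl_i65 changes above and the usubsat changes below are the wide-register version of the same fold: a whole-register select against zero (s_cselect_b64 s[0:1], 0, s[0:1] or v_cndmask_b32 ..., 0) turns into explicit mask arithmetic, with the condition bit first materialized via s_bfe_u64 or v_cndmask_b32_e64 v, 0, 1, cc. A minimal IR sketch of the underlying identity, assuming an i1 condition (the function name is illustrative):

```llvm
; Minimal sketch: select %c, i64 %x, i64 0 equals sext(%c) & %x,
; since sign extension yields an all-ones mask for true and zero for false.
define i64 @select_zero_mask_sketch(i1 %c, i64 %x) {
  %mask = sext i1 %c to i64  ; %c ? -1 : 0
  %r = and i64 %mask, %x     ; keep %x only when %c is set
  ret i64 %r
}
```
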
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index a60370cd460f9..ea3d0f9a841ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2473,14 +2473,15 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i48:
@@ -2490,8 +2491,10 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2502,22 +2505,39 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10PLUS-LABEL: v_usubsat_i48:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i48:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_usubsat_i48:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v2, v0
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
   ret i48 %result
 }
@@ -2532,14 +2552,14 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_subb_u32 s2, s1, s3
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_cmp_lg_u32 s2, s1
+; GFX6-NEXT:    s_cmp_eq_u32 s2, s1
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s3
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_i48:
@@ -2548,7 +2568,10 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s3
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_xor_b32 s2, s2, 1
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2558,7 +2581,10 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s3
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_xor_b32 s2, s2, 1
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -2568,7 +2594,10 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s3
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
@@ -2583,14 +2612,15 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i48_sv:
@@ -2600,8 +2630,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2612,21 +2644,37 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i48_sv:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i48_sv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i48_sv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v2, v0
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
   %ext.result = zext i48 %result to i64
   %cast = bitcast i64 %ext.result to <2 x float>
@@ -2641,14 +2689,15 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i48_vs:
@@ -2658,8 +2707,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2670,21 +2721,37 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i48_vs:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i48_vs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i48_vs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v2, v0
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
   %ext.result = zext i48 %result to i64
   %cast = bitcast i64 %ext.result to <2 x float>
@@ -2697,8 +2764,10 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i64:
@@ -2706,8 +2775,10 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i64:
@@ -2715,18 +2786,32 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10PLUS-LABEL: v_usubsat_i64:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_usubsat_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
 }
@@ -2736,28 +2821,40 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s3
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_xor_b32 s2, s2, 1
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s3
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_xor_b32 s2, s2, 1
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s3
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_xor_b32 s2, s2, 1
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s3
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
@@ -2769,8 +2866,10 @@ define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i64_sv:
@@ -2778,8 +2877,10 @@ define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: usubsat_i64_sv:
@@ -2787,17 +2888,30 @@ define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i64_sv:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i64_sv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i64_sv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   %cast = bitcast i64 %result to <2 x float>
   ret <2 x float> %cast
@@ -2809,8 +2923,10 @@ define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i64_vs:
@@ -2818,8 +2934,10 @@ define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: usubsat_i64_vs:
@@ -2827,17 +2945,30 @@ define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i64_vs:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i64_vs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i64_vs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   %cast = bitcast i64 %result to <2 x float>
   ret <2 x float> %cast
@@ -2849,12 +2980,16 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v2, v6
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v7, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i64:
@@ -2862,12 +2997,16 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v2, v6
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v3, v7, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i64:
@@ -2875,38 +3014,49 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v2, v6
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v7, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX10-NEXT:    v_sub_co_u32 v2, s4, v2, v6
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v3, s4, v3, v7, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s4
+; GFX10-NEXT:    v_sub_co_u32 v1, s4, v2, v6
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v2, s4, v3, v7, s4
+; GFX10-NEXT:    s_xor_b32 s5, vcc_lo, -1
+; GFX10-NEXT:    s_xor_b32 s4, s4, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v2i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT:    v_sub_co_u32 v2, s0, v2, v6
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, s0, v3, v7, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s0
+; GFX11-NEXT:    v_sub_co_u32 v1, s0, v2, v6
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v2, s0, v3, v7, s0
+; GFX11-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; GFX11-NEXT:    s_xor_b32 s0, s0, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -2917,40 +3067,64 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s5
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_xor_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
 ; GFX6-NEXT:    s_sub_u32 s2, s2, s6
 ; GFX6-NEXT:    s_subb_u32 s3, s3, s7
-; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_xor_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX6-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s5
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_xor_b32 s4, s4, 1
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
 ; GFX8-NEXT:    s_sub_u32 s2, s2, s6
 ; GFX8-NEXT:    s_subb_u32 s3, s3, s7
-; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_xor_b32 s4, s4, 1
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s5
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_xor_b32 s4, s4, 1
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
 ; GFX9-NEXT:    s_sub_u32 s2, s2, s6
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s7
-; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_xor_b32 s4, s4, 1
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_v2i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s5
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
 ; GFX10PLUS-NEXT:    s_sub_u32 s2, s2, s6
 ; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s7
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -2963,8 +3137,12 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX6-NEXT:    s_subb_u32 s2, s2, s6
 ; GFX6-NEXT:    s_subb_u32 s3, s3, s7
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_xor_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u64 s[6:7], s[4:5], 0x10000
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_i128:
@@ -2973,8 +3151,12 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX8-NEXT:    s_subb_u32 s2, s2, s6
 ; GFX8-NEXT:    s_subb_u32 s3, s3, s7
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_xor_b32 s4, s4, 1
+; GFX8-NEXT:    s_bfe_u64 s[6:7], s[4:5], 0x10000
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_i128:
@@ -2983,8 +3165,12 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX9-NEXT:    s_subb_u32 s2, s2, s6
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s7
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_xor_b32 s4, s4, 1
+; GFX9-NEXT:    s_bfe_u64 s[6:7], s[4:5], 0x10000
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_i128:
@@ -2993,8 +3179,12 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX10PLUS-NEXT:    s_subb_u32 s2, s2, s6
 ; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s7
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[6:7], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
   ret i128 %result
@@ -3006,14 +3196,16 @@ define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, s2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s3
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_mov_b32_e32 v4, s3
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v1, v2, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v3, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i128_sv:
@@ -3021,14 +3213,16 @@ define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v1, v2, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v4, v3, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: usubsat_i128_sv:
@@ -3036,27 +3230,43 @@ define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v4, v2, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v1, v2, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v3, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i128_sv:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i128_sv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v3, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i128_sv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v3, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v3, v0
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v2, v3, v2
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
   ret <4 x float> %cast
@@ -3068,14 +3278,16 @@ define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, s2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s3
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v4, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_mov_b32_e32 v4, s3
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v1, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v3, v4, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i128_vs:
@@ -3083,14 +3295,16 @@ define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v4, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v1, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v4, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: usubsat_i128_vs:
@@ -3098,27 +3312,43 @@ define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v1, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v4, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i128_vs:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i128_vs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v3, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i128_vs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v3, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v3, v0
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v2, v3, v2
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
   ret <4 x float> %cast
@@ -3130,20 +3360,24 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
-; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v5, v13, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v6, vcc, v6, v14, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v7, vcc, v7, v15, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v10, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v11, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v4, v12
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v5, v13, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v6, v14, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v7, v15, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v4, v5, v1
+; GFX6-NEXT:    v_and_b32_e32 v6, v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i128:
@@ -3151,20 +3385,24 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v12
-; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v5, v13, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v14, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v15, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v10, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v3, v11, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v4, v12
+; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v5, v13, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v6, v14, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v4, vcc, v7, v15, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v4, v5, v1
+; GFX8-NEXT:    v_and_b32_e32 v6, v5, v3
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i128:
@@ -3172,62 +3410,72 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v9, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v12
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v13, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v14, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v15, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v10, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v11, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v4, v12
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v13, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v6, v14, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v15, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v4, v5, v1
+; GFX9-NEXT:    v_and_b32_e32 v6, v5, v3
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v8
-; GFX10-NEXT:    v_sub_co_u32 v4, s4, v4, v12
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s4, v5, v13, s4
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v6, s4, v6, v14, s4
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v7, s4, v7, v15, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s4
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v10, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v3, v11, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v4, v12
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v4, vcc_lo, v5, v13, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v14, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v4, vcc_lo, v7, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_b32_e32 v4, v6, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, v6, v5
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v2i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v8
-; GFX11-NEXT:    v_sub_co_u32 v4, s0, v4, v12
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v5, s0, v5, v13, s0
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v6, s0, v6, v14, s0
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v7, s0, v7, v15, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s0
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v10, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v3, v11, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_sub_co_u32 v3, vcc_lo, v4, v12
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v4, vcc_lo, v5, v13, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v14, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v4, vcc_lo, v7, v15, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_mov_b32_e32 v7, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX11-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, v2, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, v6, v3
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v6, v6, v5
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
@@ -3240,14 +3488,22 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s9
 ; GFX6-NEXT:    s_subb_u32 s2, s2, s10
 ; GFX6-NEXT:    s_subb_u32 s3, s3, s11
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_xor_b32 s8, s8, 1
+; GFX6-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
 ; GFX6-NEXT:    s_sub_u32 s4, s4, s12
 ; GFX6-NEXT:    s_subb_u32 s5, s5, s13
 ; GFX6-NEXT:    s_subb_u32 s6, s6, s14
 ; GFX6-NEXT:    s_subb_u32 s7, s7, s15
-; GFX6-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
-; GFX6-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_xor_b32 s8, s8, 1
+; GFX6-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX6-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
+; GFX6-NEXT:    s_and_b64 s[6:7], s[8:9], s[6:7]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_v2i128:
@@ -3256,14 +3512,22 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s9
 ; GFX8-NEXT:    s_subb_u32 s2, s2, s10
 ; GFX8-NEXT:    s_subb_u32 s3, s3, s11
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_xor_b32 s8, s8, 1
+; GFX8-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
 ; GFX8-NEXT:    s_sub_u32 s4, s4, s12
 ; GFX8-NEXT:    s_subb_u32 s5, s5, s13
 ; GFX8-NEXT:    s_subb_u32 s6, s6, s14
 ; GFX8-NEXT:    s_subb_u32 s7, s7, s15
-; GFX8-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
-; GFX8-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_xor_b32 s8, s8, 1
+; GFX8-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX8-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[6:7], s[8:9], s[6:7]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_v2i128:
@@ -3272,14 +3536,22 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s9
 ; GFX9-NEXT:    s_subb_u32 s2, s2, s10
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s11
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_xor_b32 s8, s8, 1
+; GFX9-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
 ; GFX9-NEXT:    s_sub_u32 s4, s4, s12
 ; GFX9-NEXT:    s_subb_u32 s5, s5, s13
 ; GFX9-NEXT:    s_subb_u32 s6, s6, s14
 ; GFX9-NEXT:    s_subb_u32 s7, s7, s15
-; GFX9-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
-; GFX9-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_xor_b32 s8, s8, 1
+; GFX9-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX9-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
+; GFX9-NEXT:    s_and_b64 s[6:7], s[8:9], s[6:7]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_v2i128:
@@ -3288,14 +3560,22 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s9
 ; GFX10PLUS-NEXT:    s_subb_u32 s2, s2, s10
 ; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s11
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX10PLUS-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[10:11], s[0:1]
+; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
 ; GFX10PLUS-NEXT:    s_sub_u32 s4, s4, s12
 ; GFX10PLUS-NEXT:    s_subb_u32 s5, s5, s13
 ; GFX10PLUS-NEXT:    s_subb_u32 s6, s6, s14
 ; GFX10PLUS-NEXT:    s_subb_u32 s7, s7, s15
-; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
-; GFX10PLUS-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX10PLUS-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
+; GFX10PLUS-NEXT:    s_and_b64 s[6:7], s[8:9], s[6:7]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
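
The usubsat diffs above all have the same shape: a `select %borrow, 0, %diff` that used to select on vcc now becomes an invert-condition / materialize / and sequence, and the GFX10PLUS prefix splits into GFX10 and GFX11 because GFX11 can fuse the mov/and pair into v_dual_* ops. A minimal scalar sketch of the fold in IR terms (the function names are hypothetical, and the all-ones mask via `sext` is one generic way to express it):

```llvm
; Before: usubsat lowers to a sub-with-borrow feeding a select against zero.
define i64 @select_zero_on_borrow(i64 %lhs, i64 %rhs) {
  %pair = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %lhs, i64 %rhs)
  %diff = extractvalue { i64, i1 } %pair, 0
  %borrow = extractvalue { i64, i1 } %pair, 1
  %res = select i1 %borrow, i64 0, i64 %diff
  ret i64 %res
}

; After: the select becomes a mask built from the inverted condition,
; mirroring the s_xor / v_cndmask / v_and sequences in the checks above.
define i64 @select_zero_on_borrow_masked(i64 %lhs, i64 %rhs) {
  %pair = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %lhs, i64 %rhs)
  %diff = extractvalue { i64, i1 } %pair, 0
  %borrow = extractvalue { i64, i1 } %pair, 1
  %keep = xor i1 %borrow, true
  %mask = sext i1 %keep to i64   ; all-ones when there was no borrow
  %res = and i64 %mask, %diff
  ret i64 %res
}

declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)
```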
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 354f5b954659a..ab326a6cef3e3 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -1117,13 +1117,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1201,13 +1202,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1286,13 +1288,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1371,13 +1374,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
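
The ctlz checks exercise the same combine with a compare-against-constant condition; the eq forms are canonicalized to ne so the kept value lands behind the mask. A reduced sketch (hypothetical function names, all-ones mask as the generic form):

```llvm
; Before: return 0 when the input is zero, ctlz otherwise.
define i32 @ctlz_sel_eq_0(i32 %val) {
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true)
  %zero = icmp eq i32 %val, 0
  %res = select i1 %zero, i32 0, i32 %ctlz
  ret i32 %res
}

; After: invert the compare so the kept value sits behind an all-ones mask.
define i32 @ctlz_sel_eq_0_masked(i32 %val) {
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true)
  %nonzero = icmp ne i32 %val, 0
  %mask = sext i1 %nonzero to i32
  %res = and i32 %mask, %ctlz
  ret i32 %res
}

declare i32 @llvm.ctlz.i32(i32, i1 immarg)
```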
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 794b10eea58b9..12d856725c682 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1517,10 +1517,11 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; CODEGEN-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
-; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v1, v1
@@ -1558,10 +1559,11 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; IR-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; IR-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; IR-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; IR-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; IR-IEEE-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; IR-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; IR-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
-; IR-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; IR-IEEE-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
 ; IR-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
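
In the sqrt/rcp expansions, both arms of the selects are constants, so the combine can rewrite them as arithmetic on the extended condition bit instead of one cndmask per value. A sketch of the two shapes visible in these checks (names hypothetical; which extension applies depends on the constants):

```llvm
; Before: two selects whose arms are the constants (32, 0) and (-16, 0).
define i32 @scale_up_sel(i1 %denorm) {
  %amt = select i1 %denorm, i32 32, i32 0
  ret i32 %amt
}

; After: select %c, 32, 0 is a shift of the zero-extended condition bit,
; matching the v_cndmask 0/1 plus v_lshlrev_b32 ..., 5 pair above.
define i32 @scale_up(i1 %denorm) {
  %bit = zext i1 %denorm to i32
  %amt = shl i32 %bit, 5        ; 0 or 32
  ret i32 %amt
}

; After: select %c, -16, 0 is an and against a sign-extended mask.
define i32 @scale_down(i1 %denorm) {
  %mask = sext i1 %denorm to i32
  %amt = and i32 %mask, -16     ; 0 or -16
  ret i32 %amt
}
```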
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 97216b6c94693..b516660f3bdc6 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -230,15 +230,16 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
 ; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT:    s_movk_i32 s5, 0x7e00
-; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s5, s5, 0x7c00
+; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; VI-SAFE-GISEL-NEXT:    s_sub_i32 s7, 1, s4
 ; VI-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s4, 12
 ; VI-SAFE-GISEL-NEXT:    s_max_i32 s7, s7, 0
 ; VI-SAFE-GISEL-NEXT:    s_or_b32 s6, s2, s6
 ; VI-SAFE-GISEL-NEXT:    s_min_i32 s7, s7, 13
 ; VI-SAFE-GISEL-NEXT:    s_bitset1_b32 s2, 12
+; VI-SAFE-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
 ; VI-SAFE-GISEL-NEXT:    s_lshr_b32 s8, s2, s7
+; VI-SAFE-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
 ; VI-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s8, s7
 ; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s7, s2
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
@@ -358,20 +359,21 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT:    s_movk_i32 s5, 0x7e00
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s5, s5, 0x7c00
+; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_sub_i32 s6, 1, s4
-; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s7, s2, 0x1000
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
 ; GFX10-SAFE-GISEL-NEXT:    s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s9, s4, 12
+; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s4, 12
 ; GFX10-SAFE-GISEL-NEXT:    s_min_i32 s6, s6, 13
-; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s9
-; GFX10-SAFE-GISEL-NEXT:    s_lshr_b32 s8, s7, s6
-; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s8, s6
-; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s7
+; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
+; GFX10-SAFE-GISEL-NEXT:    s_lshr_b32 s9, s8, s6
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s7
+; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s9, s6
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
+; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s8
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s6, s8, s6
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s6, s9, s6
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_lt_i32 s4, 1
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s6, s2
 ; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s6, s2, 7
@@ -497,24 +499,24 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT:    s_movk_i32 s5, 0x7e00
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_sub_i32 s6, 1, s4
-; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s7, s2, 0x1000
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
 ; GFX11-SAFE-GISEL-NEXT:    s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s9, s4, 12
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s4, 12
 ; GFX11-SAFE-GISEL-NEXT:    s_min_i32 s6, s6, 13
-; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s9
-; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s8, s7, s6
-; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s8, s6
-; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s7
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
+; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s9, s8, s6
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s7
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s9, s6
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s8
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s6, s8, s6
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s6, s9, s6
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_lt_i32 s4, 1
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s6, s2
 ; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s6, s2, 7
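
The fptrunc changes fold a select between the two constants 0x7e00 and 0x7c00, which differ in a single bit, into a shift plus an or on the condition bit; that is the `s_lshl_b32 ..., 9` plus `s_or_b32 ..., 0x7c00` sequence in the new checks. Reduced IR sketch (function names hypothetical):

```llvm
; Before: pick the f16 NaN payload (0x7e00) or infinity (0x7c00).
define i32 @nan_or_inf_sel(i1 %is_nan) {
  %r = select i1 %is_nan, i32 32256, i32 31744   ; 0x7e00 vs. 0x7c00
  ret i32 %r
}

; After: the constants differ only in bit 9, so shift the condition bit
; into place and or in the common 0x7c00 base.
define i32 @nan_or_inf_folded(i1 %is_nan) {
  %bit = zext i1 %is_nan to i32
  %payload = shl i32 %bit, 9
  %r = or i32 %payload, 31744                    ; 0x7c00
  ret i32 %r
}
```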
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index 046f262469695..2e8db0e1439a9 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -1850,10 +1850,11 @@ define float @v_sqrt_f32_ulp2(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1886,10 +1887,11 @@ define float @v_sqrt_f32_ulp25(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1922,10 +1924,11 @@ define float @v_sqrt_f32_ulp3(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1957,10 +1960,11 @@ define float @v_sqrt_f32_ulp2_fabs(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, s[4:5]
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2090,16 +2094,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v3, -16, v3
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2232,16 +2238,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, s[4:5]
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[6:7], |v1|, v2
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[6:7]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, v2
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v1, |v1|, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v1, |v1|, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[4:5]
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[6:7]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v3, -16, v3
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2328,10 +2336,11 @@ define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v1, v0
@@ -2425,10 +2434,11 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v3, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
@@ -2509,10 +2519,11 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v3, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
@@ -2589,10 +2600,11 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v3, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
@@ -2658,16 +2670,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v3, -16, v3
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
@@ -2802,16 +2816,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v6, 5, v5
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v6
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v6, 5, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v6
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v5, -16, v5
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v4, -16, v4
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -2929,16 +2945,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v6, 5, v5
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v6
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v6, 5, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v6
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v5, -16, v5
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v4, -16, v4
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -3029,10 +3047,11 @@ define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x)
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3064,10 +3083,11 @@ define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub)
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3099,10 +3119,11 @@ define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3698,10 +3719,11 @@ define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero n
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3733,10 +3755,11 @@ define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3768,10 +3791,11 @@ define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3911,12 +3935,13 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v0, v1
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v2, v0, v2
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v2, v2
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; GISEL-IEEE-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GISEL-IEEE-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
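
Between the f32 and f64 sqrt diffs, it may help to spell out the two recurring select-of-constants shapes in the checks above. Where the false value is 0 and the true value is a power of two (the 0/32 and 0/0x100 selects), the select becomes the zero-extended condition shifted left; where the true value is a general constant (the 0/-16 and 0/0xffffff80 selects), it becomes a mask applied to the extended condition. A minimal C++ model of the arithmetic, under the assumption that the condition is extended appropriately for each form; the function names are illustrative, not the CombinerHelper API:

```c++
#include <cstdint>

// select %c, (1 << K), 0  ->  (zext %c) << K
// e.g. the 0/32 selects become v_cndmask 0/1 followed by v_lshlrev 5.
uint32_t selectPow2OrZero(bool Cond, unsigned K) {
  return static_cast<uint32_t>(Cond) << K;
}

// select %c, C, 0  ->  (sext %c) & C
// e.g. the 0/-16 and 0/0xffffff80 selects become a single v_and_b32,
// assuming the condition is sign-extended to all-ones when true.
uint32_t selectConstOrZero(bool Cond, uint32_t C) {
  uint32_t Mask = Cond ? ~0u : 0u; // i1 sign-extended
  return Mask & C;
}
```

Both forms replace a constant materialization (v_mov of 32, 0x100, 0xffffff80) plus a v_cndmask with ALU ops on a single 0/1 condition value, which lets the generated code reuse the materialized condition for both the pre-scale and post-scale ldexp exponents, as the fsqrt checks show.
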
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 196a3705ac818..32da005ccb000 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -40,8 +40,8 @@ define double @v_sqrt_f64(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -53,11 +53,10 @@ define double @v_sqrt_f64(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -99,9 +98,9 @@ define double @v_sqrt_f64_fneg(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], -v[0:1], v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -113,11 +112,10 @@ define double @v_sqrt_f64_fneg(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -160,9 +158,9 @@ define double @v_sqrt_f64_fabs(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -174,11 +172,10 @@ define double @v_sqrt_f64_fabs(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -221,9 +218,9 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], -|v[0:1]|, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -235,11 +232,10 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -284,8 +280,8 @@ define double @v_sqrt_f64_ninf(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -297,11 +293,10 @@ define double @v_sqrt_f64_ninf(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -344,8 +339,8 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -357,11 +352,10 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -404,8 +398,8 @@ define double @v_sqrt_f64_nnan(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -417,11 +411,10 @@ define double @v_sqrt_f64_nnan(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -464,8 +457,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -477,11 +470,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
@@ -533,8 +525,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -546,11 +538,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
@@ -602,8 +593,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -615,11 +606,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
@@ -671,8 +661,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -684,11 +674,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
@@ -740,8 +729,8 @@ define double @v_sqrt_f64_nsz(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -753,11 +742,10 @@ define double @v_sqrt_f64_nsz(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -800,8 +788,8 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -813,11 +801,10 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -860,8 +847,8 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -873,11 +860,10 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -920,8 +906,8 @@ define double @v_sqrt_f64_afn(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -933,11 +919,10 @@ define double @v_sqrt_f64_afn(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -980,8 +965,8 @@ define double @v_sqrt_f64_afn_nsz(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -993,11 +978,10 @@ define double @v_sqrt_f64_afn_nsz(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1058,14 +1042,15 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1078,23 +1063,22 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_and_b32_e32 v10, 0xffffff80, v17
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
@@ -1139,8 +1123,8 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1152,11 +1136,10 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1198,9 +1181,9 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1212,11 +1195,10 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1260,8 +1242,8 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1273,11 +1255,10 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1338,14 +1319,15 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1358,23 +1340,22 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_and_b32_e32 v10, 0xffffff80, v17
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
@@ -1419,8 +1400,8 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1432,11 +1413,10 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1479,8 +1459,8 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1492,11 +1472,10 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1539,8 +1518,8 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1552,11 +1531,10 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1599,8 +1577,8 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1612,11 +1590,10 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1677,14 +1654,15 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1697,23 +1675,22 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_and_b32_e32 v10, 0xffffff80, v17
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
@@ -1795,61 +1772,62 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_mov_b32_e32 v6, s4
-; GISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v8, vcc
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, v8, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v8, s[6:7]
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v9
-; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; GISEL-NEXT:    v_mov_b32_e32 v10, s4
+; GISEL-NEXT:    v_mov_b32_e32 v11, s5
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[4:5], v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v18
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 8, v20
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v14
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[8:9], v[2:3]
-; GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[4:5]
-; GISEL-NEXT:    v_mul_f64 v[12:13], v[6:7], 0.5
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 8, v19
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
+; GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[2:3]
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
 ; GISEL-NEXT:    v_mul_f64 v[6:7], v[0:1], v[6:7]
-; GISEL-NEXT:    v_mul_f64 v[14:15], v[8:9], 0.5
-; GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[8:9]
-; GISEL-NEXT:    v_mul_f64 v[16:17], v[10:11], 0.5
-; GISEL-NEXT:    v_mul_f64 v[10:11], v[4:5], v[10:11]
-; GISEL-NEXT:    v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5
-; GISEL-NEXT:    v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5
-; GISEL-NEXT:    v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5
-; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
-; GISEL-NEXT:    v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
-; GISEL-NEXT:    v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
-; GISEL-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
-; GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
-; GISEL-NEXT:    v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
-; GISEL-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
-; GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
-; GISEL-NEXT:    v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
-; GISEL-NEXT:    v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, 0, v12, vcc
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, v12, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[6:7]
-; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v14
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v13
-; GISEL-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v15
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], v13
-; GISEL-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v12
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[14:15], v[10:11], 0.5
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[2:3], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7]
+; GISEL-NEXT:    v_rsq_f64_e32 v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[16:17], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[16:17], v[8:9], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[8:9], -v[14:15], v[10:11], 0.5
+; GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], 0.5
+; GISEL-NEXT:    v_mul_f64 v[12:13], v[4:5], v[12:13]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[8:9], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[14:15], v[8:9], v[14:15]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[16:17], v[12:13], 0.5
+; GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; GISEL-NEXT:    v_fma_f64 v[14:15], v[16:17], v[14:15], v[16:17]
+; GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[10:11], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[8:9], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[16:17], -v[12:13], v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[12:13], v[16:17], v[14:15], v[12:13]
+; GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[10:11], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[16:17], v[8:9], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[10:11], -v[12:13], v[12:13], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v16, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v17, 0xffffff80, v18
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v16
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v17
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v16
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], v16
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[12:13]
+; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff80, v19
+; GISEL-NEXT:    v_and_b32_e32 v13, 0xffffff80, v20
+; GISEL-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v12
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
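The recurring change in the sqrt hunks above replaces a `v_cndmask` of the constant `0x100` with a 0/1 `v_cndmask` followed by a `v_lshlrev` by 8, dropping the `v_mov` that used to materialize `0x100`. A minimal C sketch of the arithmetic identity this relies on, assuming a 1-bit condition; all identifiers here are illustrative and not taken from the patch:

```c
/* Sketch only: for a 1-bit condition c,
 * select(c, 1 << K, 0) == zext(c) << K.
 * This mirrors the v_cndmask 0/1 + v_lshlrev 8 pairs in the checks above. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  for (uint32_t c = 0; c <= 1; ++c) {
    uint32_t sel = c ? 0x100u : 0u; /* select(c, 0x100, 0) */
    assert(sel == (c << 8));        /* zext(c) << 8 */
  }
  return 0;
}
```

One visible payoff in the diff is that the 0/1 condition value (`v8` above) stays live and is reused by the later mask-and against `0xffffff80`, instead of both constants being re-materialized into registers.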
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
index b3912aea55f79..fcc57b8bb7075 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -102,9 +102,9 @@ define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) {
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GISEL-NEXT:    global_store_b64 v[2:3], v[4:5], off
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
@@ -164,8 +164,8 @@ define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace
 ; GISEL-NEXT:    s_add_u32 s0, s0, 1
 ; GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GISEL-NEXT:  .LBB5_2: ; %endif
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index 2c07b47bf1ed5..f21e8989640e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -53,14 +53,15 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v0
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v0
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v3, v1
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   ret { half, i32 } %result
@@ -150,9 +151,10 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v1
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   %result.0 = extractvalue { half, i32 } %result, 1
@@ -227,20 +229,22 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v6, v1
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v4, v0
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v4, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v5, v1
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v6
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v6|, v3
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v4, v6
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v3, v3, v4
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32:
@@ -396,12 +400,14 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, v2
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v3
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v2
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
@@ -470,14 +476,15 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v0
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v0
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v3, v1
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   ret { half, i16 } %result
@@ -563,9 +570,10 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v1
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   %result.0 = extractvalue { half, i16 } %result, 1
@@ -631,10 +639,11 @@ define { float, i32 } @test_frexp_f32_i32(float %a) {
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v3, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { float, i32 } @llvm.frexp.f32.i32(float %a)
@@ -714,9 +723,10 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) {
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { float, i32 } @llvm.frexp.f32.i32(float %a)
   %result.0 = extractvalue { float, i32 } %result, 1
@@ -780,14 +790,16 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v4
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v2, v5, v2
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v5, v1
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v3, v4, v3
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -904,12 +916,14 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v3
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, v3
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a)
   %result.1 = extractvalue { <2 x float>, <2 x i32> } %result, 1
@@ -963,10 +977,11 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[5:6]
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
-; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[5:6]
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v2, v5, v2
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1053,9 +1068,10 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7ff00000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
-; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { double, i32 } @llvm.frexp.f64.i32(double %a)
   %result.0 = extractvalue { double, i32 } %result, 1
@@ -1126,15 +1142,17 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX6-GISEL-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[5:6], v[0:1]
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v4, v7, v4
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[6:7], v[2:3]
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v5, v[2:3]
-; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v5, v8, v5
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1184,18 +1202,18 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) {
 }
 
 define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
-; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0
-; GFX6-NEXT:    s_mov_b32 s5, 0x7ff00000
-; GFX6-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
-; GFX6-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GFX6-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; GFX6-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, 0x7ff00000
+; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX6-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX6-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
 ; GFX8:       ; %bb.0:
@@ -1217,6 +1235,21 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s5, 0x7ff00000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 s[6:7], |v[0:1]|, s[4:5]
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[2:3]|, s[4:5]
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
   %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1
   ret <2 x i32> %result.1
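The frexp hunks above fold `select(cond, exp, 0)` into a `v_cndmask` producing a mask operand plus a `v_and`. A minimal C sketch of the underlying identity, assuming the mask is the sign-extension of a 1-bit condition to all ones; the variable names and the payload constant are illustrative, not from the patch:

```c
/* Sketch only: with mask = sext(c) for a 1-bit condition c,
 * select(c, x, 0) == mask & x, since sext(c) is 0 or all-ones. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  uint32_t x = 0xdeadbeefu; /* arbitrary payload value */
  for (uint32_t c = 0; c <= 1; ++c) {
    uint32_t mask = c ? 0xffffffffu : 0u; /* sign-extended i1 */
    uint32_t sel = c ? x : 0u;            /* select(c, x, 0) */
    assert(sel == (mask & x));
  }
  return 0;
}
```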
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index a0b2d3b32b795..59376d0a595fc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -5567,9 +5567,10 @@ define float @v_log_f32_undef() {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5608,15 +5609,16 @@ define float @v_log_f32_undef() {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5649,9 +5651,10 @@ define float @v_log_f32_undef() {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5686,21 +5689,22 @@ define float @v_log_f32_undef() {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 5ba72612321a6..624648b73515b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -5567,9 +5567,10 @@ define float @v_log10_f32_undef() {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5608,15 +5609,16 @@ define float @v_log10_f32_undef() {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5649,9 +5651,10 @@ define float @v_log10_f32_undef() {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5686,21 +5689,22 @@ define float @v_log10_f32_undef() {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 6ccef4c02ab3b..f40c13d0762b9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -2541,9 +2541,10 @@ define float @v_log2_f32_undef() {
 ; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX689-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GFX689-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX689-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -2560,10 +2561,11 @@ define float @v_log2_f32_undef() {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 3dc565ceed0d0..3363f92ad21a7 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -62,13 +62,13 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -78,9 +78,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
@@ -146,8 +145,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -159,11 +158,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
@@ -241,14 +239,14 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -258,9 +256,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
@@ -325,9 +322,9 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -339,11 +336,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
@@ -423,13 +419,13 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -439,9 +435,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
@@ -507,8 +502,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -520,11 +515,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
@@ -602,14 +596,14 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], -s[0:1], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -619,9 +613,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
@@ -686,9 +679,9 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], -s[0:1], v[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -700,11 +693,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
@@ -784,13 +776,13 @@ define double @v_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -800,9 +792,8 @@ define double @v_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -866,8 +857,8 @@ define double @v_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -879,11 +870,10 @@ define double @v_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -952,14 +942,14 @@ define double @v_rsq_f64_fabs(double %x) {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -969,9 +959,8 @@ define double @v_rsq_f64_fabs(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1034,9 +1023,9 @@ define double @v_rsq_f64_fabs(double %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1048,11 +1037,10 @@ define double @v_rsq_f64_fabs(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1123,13 +1111,13 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -1139,9 +1127,8 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1205,8 +1192,8 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1218,11 +1205,10 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1292,13 +1278,13 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -1308,9 +1294,8 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1374,8 +1359,8 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1387,11 +1372,10 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1461,13 +1445,13 @@ define double @v_neg_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -1477,9 +1461,8 @@ define double @v_neg_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
@@ -1543,8 +1526,8 @@ define double @v_neg_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1556,11 +1539,10 @@ define double @v_neg_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
@@ -1664,29 +1646,29 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 8, v14
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_and_b32_e32 v8, 0xffffff80, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
@@ -1697,13 +1679,13 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v14
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
@@ -1811,14 +1793,15 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1831,23 +1814,22 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0
@@ -1965,29 +1947,29 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbff00000
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbff00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 8, v14
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_and_b32_e32 v8, 0xffffff80, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
@@ -1998,13 +1980,13 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v14
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
@@ -2112,14 +2094,15 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2132,23 +2115,22 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
@@ -2234,28 +2216,28 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 8, v14
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_and_b32_e32 v8, 0xffffff80, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
@@ -2266,13 +2248,13 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v14
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
@@ -2352,14 +2334,15 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2372,23 +2355,22 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
@@ -2507,28 +2489,28 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 8, v14
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_and_b32_e32 v8, 0xffffff80, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
@@ -2539,13 +2521,13 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v14
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
@@ -2655,14 +2637,15 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2675,23 +2658,22 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
@@ -2774,14 +2756,14 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], -|v[0:1]|, v[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -2791,9 +2773,8 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -2856,9 +2837,9 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], -|v[0:1]|, v[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -2870,11 +2851,10 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -2946,13 +2926,13 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -2962,9 +2942,8 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -3028,8 +3007,8 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3041,11 +3020,10 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -3107,12 +3085,12 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3122,9 +3100,8 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3177,8 +3154,8 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3190,11 +3167,10 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3252,12 +3228,12 @@ define double @v_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3267,9 +3243,8 @@ define double @v_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3322,8 +3297,8 @@ define double @v_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3335,11 +3310,10 @@ define double @v_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3398,12 +3372,12 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3413,9 +3387,8 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3470,8 +3443,8 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3483,11 +3456,10 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3546,12 +3518,12 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3561,9 +3533,8 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3616,8 +3587,8 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3629,11 +3600,10 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3691,12 +3661,12 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3706,9 +3676,8 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3761,8 +3730,8 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3774,11 +3743,10 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3836,12 +3804,12 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3851,9 +3819,8 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3906,8 +3873,8 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3919,11 +3886,10 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3982,12 +3948,12 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3997,9 +3963,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4054,8 +4019,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4067,11 +4032,10 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4138,13 +4102,13 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -4154,9 +4118,8 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -4220,8 +4183,8 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4233,11 +4196,10 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -4325,44 +4287,44 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; SI-GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff80, v12
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 8, v13
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v13, 0, v12, vcc
-; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[10:11], 0.5
-; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[8:9], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[8:9], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
-; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v12
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v13
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v13
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v12
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
@@ -4447,14 +4409,15 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -4467,28 +4430,27 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
-; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
@@ -4550,12 +4512,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -4565,9 +4527,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4622,8 +4583,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4635,11 +4596,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4706,12 +4666,12 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -4721,9 +4681,8 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4776,8 +4735,8 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4789,11 +4748,10 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -5112,12 +5070,12 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5127,9 +5085,8 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5193,8 +5150,8 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5206,11 +5163,10 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v7, 0xffffff80, v10
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v7
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5279,12 +5235,12 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5294,9 +5250,8 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5360,8 +5315,8 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5373,11 +5328,10 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v7, 0xffffff80, v10
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v7
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5446,12 +5400,12 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5461,9 +5415,8 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5527,8 +5480,8 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5540,11 +5493,10 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v7, 0xffffff80, v10
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v7
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5616,17 +5568,17 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0x40700000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x40700000
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x40700000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -5634,9 +5586,8 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7]
@@ -5702,10 +5653,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40700000
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -5717,11 +5668,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5]
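
The change that repeats through these hunks is the select-of-constants combine from the commit message: a v_cndmask between 0 and an immediate (0x100 feeding the scale-up v_ldexp_f64, 0xffffff80 feeding the scale-down) is rewritten as a 0/1 v_cndmask whose result is then shifted or masked, so the wide constants no longer need a dedicated v_mov_b32. A minimal C sketch of the integer identities such a rewrite relies on (hypothetical helper names, not code from the patch):

```c
#include <assert.h>
#include <stdint.h>

/* Standalone check of the two integer identities behind the new
   GISEL sequences above; the function names are illustrative and
   not taken from the patch. */

/* select(cond, 0x100, 0) == zext(cond) << 8
   (the v_cndmask_b32 0/1 + v_lshlrev_b32 pair) */
static uint32_t select_as_shift(int cond) {
  return (uint32_t)(cond != 0) << 8;
}

/* select(cond, 0xffffff80, 0) == (cond ? ~0u : 0u) & 0xffffff80
   (an all-ones mask derived from the condition, then an AND) */
static uint32_t select_as_mask(int cond) {
  uint32_t allones = (uint32_t)-(int32_t)(cond != 0); /* 0 or 0xffffffff */
  return allones & 0xffffff80u;
}

int main(void) {
  assert(select_as_shift(1) == 0x100u && select_as_shift(0) == 0u);
  assert(select_as_mask(1) == 0xffffff80u && select_as_mask(0) == 0u);
  return 0;
}
```

In the shift form both operands stay inline, and in the mask form 0xffffff80 fits as an inline literal on v_and_b32_e32, which is why the `v_mov_b32_e32 v4, 0x100` and `v_mov_b32_e32 v4, 0xffffff80` materializations disappear from the SI/VI output above.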


