[llvm] [GlobalIsel] Combine selects with constants (PR #76089)

Thorsten Schütt via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 20 10:46:11 PST 2023


https://github.com/tschuett created https://github.com/llvm/llvm-project/pull/76089

A first small step toward combining selects.
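For reference, the new select-of-constants folds rest on simple integer identities over a boolean condition: zext(Cond) is 0 or 1 and sext(Cond) is 0 or -1. The following standalone C++ snippet (illustrative only, not part of the patch; it uses int8_t as a stand-in for s8 and bool for the i1 condition, with wrapping casts modelling i8 arithmetic) checks those identities exhaustively:

#include <cassert>
#include <cstdint>

int main() {
  for (bool C : {false, true}) {
    int8_t Z = C ? 1 : 0;  // zext(C)
    int8_t S = C ? -1 : 0; // sext(C)
    // select C, 1, 0 --> zext(C); select C, -1, 0 --> sext(C)
    assert((C ? 1 : 0) == Z && (C ? -1 : 0) == S);
    // select C, 0, 1 --> zext(!C); select C, 0, -1 --> sext(!C)
    assert((C ? 0 : 1) == (!C ? 1 : 0) && (C ? 0 : -1) == (!C ? -1 : 0));
    for (int V = -128; V <= 127; ++V) {
      int8_t C1 = (int8_t)V;
      // select C, C1, C1-1 --> add (zext C), C1-1 (wrapping i8 arithmetic)
      assert((int8_t)(C ? C1 : C1 - 1) == (int8_t)(Z + (C1 - 1)));
      // select C, C1, C1+1 --> add (sext C), C1+1
      assert((int8_t)(C ? C1 : C1 + 1) == (int8_t)(S + (C1 + 1)));
      // select C, -1, C1 --> or (sext C), C1
      assert((int8_t)(C ? -1 : C1) == (int8_t)(S | C1));
      // select C, C1, -1 --> or (sext (not C)), C1
      assert((int8_t)(C ? C1 : -1) == (int8_t)((!C ? -1 : 0) | C1));
    }
    // select C, 64, 0 --> (zext C) << log2(64), since 64 is a power of two
    assert((C ? 64 : 0) == (Z << 6));
  }
  return 0;
}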

>From e90cf84e9bd6a7aec88c347cd4a768818044094f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Wed, 20 Dec 2023 19:43:04 +0100
Subject: [PATCH] [GlobalIsel] Combine selects with constants

A first small step toward combining selects.
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   18 +-
 .../include/llvm/Target/GlobalISel/Combine.td |   15 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  352 ++-
 .../AArch64/GlobalISel/combine-select.mir     |  246 ++
 llvm/test/CodeGen/AArch64/andcompare.ll       |   14 +-
 llvm/test/CodeGen/AArch64/arm64-ccmp.ll       |  124 +-
 llvm/test/CodeGen/AArch64/call-rv-marker.ll   |  447 ++-
 .../AArch64/neon-bitwise-instructions.ll      |   20 +-
 .../combine-fold-binop-into-select.mir        |   42 +-
 ...-divergent-i1-phis-no-lane-mask-merging.ll |    6 +-
 .../GlobalISel/divergence-structurizer.ll     |    6 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 2192 ++++++++-------
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 2416 ++++++++---------
 .../GlobalISel/llvm.amdgcn.wqm.demote.ll      |   48 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |   77 +-
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll |  349 ++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |  111 +-
 .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll |  894 +++---
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |   32 +-
 llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll    |   14 +-
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           |   46 +-
 llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll         |  199 +-
 llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll         |  498 ++--
 .../AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll  |    8 +-
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  157 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |   40 +-
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |   40 +-
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |   14 +-
 llvm/test/CodeGen/AMDGPU/rsq.f64.ll           |  950 +++----
 29 files changed, 5174 insertions(+), 4201 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index e7debc652a0a8b..dcc1a4580b14a2 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -769,9 +769,6 @@ class CombinerHelper {
   bool matchCombineFSubFpExtFNegFMulToFMadOrFMA(MachineInstr &MI,
                                                 BuildFnTy &MatchInfo);
 
-  /// Fold boolean selects to logical operations.
-  bool matchSelectToLogical(MachineInstr &MI, BuildFnTy &MatchInfo);
-
   bool matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &Info);
 
   /// Transform G_ADD(x, G_SUB(y, x)) to y.
@@ -814,6 +811,9 @@ class CombinerHelper {
   // Given a binop \p MI, commute operands 1 and 2.
   void applyCommuteBinOpOperands(MachineInstr &MI);
 
+  /// Combine selects.
+  bool matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
@@ -904,6 +904,18 @@ class CombinerHelper {
   /// select (fcmp uge x, 1.0) 1.0, x -> fminnm x, 1.0
   bool matchFPSelectToMinMax(Register Dst, Register Cond, Register TrueVal,
                              Register FalseVal, BuildFnTy &MatchInfo);
+
+  /// Try to fold selects to logical operations.
+  bool tryFoldBoolSelectToLogic(GSelect *Select, BuildFnTy &MatchInfo);
+
+  bool tryFoldSelectOfConstants(GSelect *Select, BuildFnTy &MatchInfo);
+
+  bool isOneOrOneSplat(Register Src, bool AllowUndefs);
+  bool isZeroOrZeroSplat(Register Src, bool AllowUndefs);
+  bool isConstantSplatVector(Register Src, int64_t SplatValue,
+                             bool AllowUndefs);
+
+  std::optional<APInt> getConstantOrConstantSplatVector(Register Src);
 };
 } // namespace llvm
 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 77db371adaf776..6bda80681432a0 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -437,13 +437,6 @@ def select_constant_cmp: GICombineRule<
   (apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${matchinfo}); }])
 >;
 
-def select_to_logical : GICombineRule<
-  (defs root:$root, build_fn_matchinfo:$matchinfo),
-  (match (wip_match_opcode G_SELECT):$root,
-    [{ return Helper.matchSelectToLogical(*${root}, ${matchinfo}); }]),
-  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
->;
-
 // Fold (C op x) -> (x op C)
 // TODO: handle more isCommutable opcodes
 // TODO: handle compares (currently not marked as isCommutable)
@@ -1242,6 +1235,12 @@ def select_to_minmax: GICombineRule<
          [{ return Helper.matchSimplifySelectToMinMax(*${root}, ${info}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;
 
+def match_selects : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SELECT):$root,
+        [{ return Helper.matchSelect(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -1282,7 +1281,7 @@ def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend,
 def phi_combines : GICombineGroup<[extend_through_phis]>;
 
 def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp,
-                                      select_to_logical]>;
+                                      match_selects]>;
 
 def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd,
                                        mul_by_neg_one, idempotent_prop]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 91a64d59e154df..072a73ded170bd 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5940,62 +5940,6 @@ bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA(
   return false;
 }
 
-bool CombinerHelper::matchSelectToLogical(MachineInstr &MI,
-                                          BuildFnTy &MatchInfo) {
-  GSelect &Sel = cast<GSelect>(MI);
-  Register DstReg = Sel.getReg(0);
-  Register Cond = Sel.getCondReg();
-  Register TrueReg = Sel.getTrueReg();
-  Register FalseReg = Sel.getFalseReg();
-
-  auto *TrueDef = getDefIgnoringCopies(TrueReg, MRI);
-  auto *FalseDef = getDefIgnoringCopies(FalseReg, MRI);
-
-  const LLT CondTy = MRI.getType(Cond);
-  const LLT OpTy = MRI.getType(TrueReg);
-  if (CondTy != OpTy || OpTy.getScalarSizeInBits() != 1)
-    return false;
-
-  // We have a boolean select.
-
-  // select Cond, Cond, F --> or Cond, F
-  // select Cond, 1, F    --> or Cond, F
-  auto MaybeCstTrue = isConstantOrConstantSplatVector(*TrueDef, MRI);
-  if (Cond == TrueReg || (MaybeCstTrue && MaybeCstTrue->isOne())) {
-    MatchInfo = [=](MachineIRBuilder &MIB) {
-      MIB.buildOr(DstReg, Cond, FalseReg);
-    };
-    return true;
-  }
-
-  // select Cond, T, Cond --> and Cond, T
-  // select Cond, T, 0    --> and Cond, T
-  auto MaybeCstFalse = isConstantOrConstantSplatVector(*FalseDef, MRI);
-  if (Cond == FalseReg || (MaybeCstFalse && MaybeCstFalse->isZero())) {
-    MatchInfo = [=](MachineIRBuilder &MIB) {
-      MIB.buildAnd(DstReg, Cond, TrueReg);
-    };
-    return true;
-  }
-
- // select Cond, T, 1 --> or (not Cond), T
-  if (MaybeCstFalse && MaybeCstFalse->isOne()) {
-    MatchInfo = [=](MachineIRBuilder &MIB) {
-      MIB.buildOr(DstReg, MIB.buildNot(OpTy, Cond), TrueReg);
-    };
-    return true;
-  }
-
-  // select Cond, 0, F --> and (not Cond), F
-  if (MaybeCstTrue && MaybeCstTrue->isZero()) {
-    MatchInfo = [=](MachineIRBuilder &MIB) {
-      MIB.buildAnd(DstReg, MIB.buildNot(OpTy, Cond), FalseReg);
-    };
-    return true;
-  }
-  return false;
-}
-
 bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI,
                                             unsigned &IdxToPropagate) {
   bool PropagateNaN;
@@ -6318,3 +6262,299 @@ void CombinerHelper::applyCommuteBinOpOperands(MachineInstr &MI) {
   MI.getOperand(2).setReg(LHSReg);
   Observer.changedInstr(MI);
 }
+
+bool CombinerHelper::isOneOrOneSplat(Register Src, bool AllowUndefs) {
+  LLT SrcTy = MRI.getType(Src);
+  if (SrcTy.isFixedVector())
+    return isConstantSplatVector(Src, 1, AllowUndefs);
+  if (SrcTy.isScalar()) {
+    if (AllowUndefs && getOpcodeDef<GImplicitDef>(Src, MRI) != nullptr)
+      return true;
+    auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+    return IConstant && IConstant->Value == 1;
+  }
+  return false; // scalable vector
+}
+
+bool CombinerHelper::isZeroOrZeroSplat(Register Src, bool AllowUndefs) {
+  LLT SrcTy = MRI.getType(Src);
+  if (SrcTy.isFixedVector())
+    return isConstantSplatVector(Src, 0, AllowUndefs);
+  if (SrcTy.isScalar()) {
+    if (AllowUndefs && getOpcodeDef<GImplicitDef>(Src, MRI) != nullptr)
+      return true;
+    auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+    return IConstant && IConstant->Value == 0;
+  }
+  return false; // scalable vector
+}
+
+// Ignores COPYs when checking the build vector's sources.
+// FIXME: support scalable vectors.
+bool CombinerHelper::isConstantSplatVector(Register Src, int64_t SplatValue,
+                                           bool AllowUndefs) {
+  GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+  if (!BuildVector)
+    return false;
+  unsigned NumSources = BuildVector->getNumSources();
+
+  for (unsigned I = 0; I < NumSources; ++I) {
+    GImplicitDef *ImplicitDef =
+        getOpcodeDef<GImplicitDef>(BuildVector->getSourceReg(I), MRI);
+    if (ImplicitDef && AllowUndefs)
+      continue;
+    if (ImplicitDef && !AllowUndefs)
+      return false;
+    std::optional<ValueAndVReg> IConstant =
+        getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+    if (IConstant && IConstant->Value == SplatValue)
+      continue;
+    return false;
+  }
+  return true;
+}
+
+// Ignores COPYs when looking up constants.
+// FIXME: support scalable vectors.
+std::optional<APInt>
+CombinerHelper::getConstantOrConstantSplatVector(Register Src) {
+  auto IConstant = getIConstantVRegValWithLookThrough(Src, MRI);
+  if (IConstant)
+    return IConstant->Value;
+
+  GBuildVector *BuildVector = getOpcodeDef<GBuildVector>(Src, MRI);
+  if (!BuildVector)
+    return std::nullopt;
+  unsigned NumSources = BuildVector->getNumSources();
+
+  std::optional<APInt> Value = std::nullopt;
+  for (unsigned I = 0; I < NumSources; ++I) {
+    std::optional<ValueAndVReg> IConstant =
+        getIConstantVRegValWithLookThrough(BuildVector->getSourceReg(I), MRI);
+    if (!IConstant)
+      return std::nullopt;
+    if (!Value)
+      Value = IConstant->Value;
+    else if (*Value != IConstant->Value)
+      return std::nullopt;
+  }
+  return Value;
+}
+
+// TODO: use knownbits to determine zeros
+bool CombinerHelper::tryFoldSelectOfConstants(GSelect *Select,
+                                              BuildFnTy &MatchInfo) {
+  uint32_t Flags = Select->getFlags();
+  Register Dest = Select->getReg(0);
+  Register Cond = Select->getCondReg();
+  Register True = Select->getTrueReg();
+  Register False = Select->getFalseReg();
+  LLT CondTy = MRI.getType(Select->getCondReg());
+  LLT TrueTy = MRI.getType(Select->getTrueReg());
+
+  // Either both are scalars or both are vectors.
+  std::optional<APInt> TrueOpt = getConstantOrConstantSplatVector(True);
+  std::optional<APInt> FalseOpt = getConstantOrConstantSplatVector(False);
+
+  if (!TrueOpt || !FalseOpt)
+    return false;
+
+  // For vectors, these are the splat values.
+  APInt TrueValue = *TrueOpt;
+  APInt FalseValue = *FalseOpt;
+
+  // Boolean or fixed vector of booleans.
+  if (CondTy.isScalableVector() ||
+      (CondTy.isFixedVector() &&
+       CondTy.getElementType().getScalarSizeInBits() != 1) ||
+      CondTy.getScalarSizeInBits() != 1)
+    return false;
+
+  // select Cond, 1, 0 --> zext (Cond)
+  if (TrueValue.isOne() && FalseValue.isZero()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      B.buildZExtOrTrunc(Dest, Cond);
+    };
+    return true;
+  }
+
+  // select Cond, -1, 0 --> sext (Cond)
+  if (TrueValue.isAllOnes() && FalseValue.isZero()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      B.buildSExtOrTrunc(Dest, Cond);
+    };
+    return true;
+  }
+
+  // select Cond, 0, 1 --> zext (!Cond)
+  if (TrueValue.isZero() && FalseValue.isOne()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Inner, Cond);
+      B.buildZExtOrTrunc(Dest, Inner);
+    };
+    return true;
+  }
+
+  // select Cond, 0, -1 --> sext (!Cond)
+  if (TrueValue.isZero() && FalseValue.isAllOnes()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Inner, Cond);
+      B.buildSExtOrTrunc(Dest, Inner);
+    };
+    return true;
+  }
+
+  // select Cond, C1, C1-1 --> add (zext Cond), C1-1
+  if (TrueValue - 1 == FalseValue) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Inner, Cond);
+      B.buildAdd(Dest, Inner, False);
+    };
+    return true;
+  }
+
+  // select Cond, C1, C1+1 --> add (sext Cond), C1+1
+  if (TrueValue + 1 == FalseValue) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildSExtOrTrunc(Inner, Cond);
+      B.buildAdd(Dest, Inner, False);
+    };
+    return true;
+  }
+
+  // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
+  if (TrueValue.isPowerOf2() && FalseValue.isZero()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Inner, Cond);
+      // The shift amount must be scalar.
+      LLT ShiftTy = TrueTy.isVector() ? TrueTy.getElementType() : TrueTy;
+      auto ShAmtC = B.buildConstant(ShiftTy, TrueValue.exactLogBase2());
+      B.buildShl(Dest, Inner, ShAmtC, Flags);
+    };
+    return true;
+  }
+  // select Cond, -1, C --> or (sext Cond), C
+  if (TrueValue.isAllOnes()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildSExtOrTrunc(Inner, Cond);
+      B.buildOr(Dest, Inner, False, Flags);
+    };
+    return true;
+  }
+
+  // select Cond, C, -1 --> or (sext (not Cond)), C
+  if (FalseValue.isAllOnes()) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Not = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Not, Cond);
+      Register Inner = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildSExtOrTrunc(Inner, Not);
+      B.buildOr(Dest, Inner, True, Flags);
+    };
+    return true;
+  }
+
+  return false;
+}
+
+// TODO: use knownbits to determine zeros
+bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select,
+                                              BuildFnTy &MatchInfo) {
+  uint32_t Flags = Select->getFlags();
+  Register DstReg = Select->getReg(0);
+  Register Cond = Select->getCondReg();
+  Register True = Select->getTrueReg();
+  Register False = Select->getFalseReg();
+  LLT CondTy = MRI.getType(Select->getCondReg());
+  LLT TrueTy = MRI.getType(Select->getTrueReg());
+
+  // Boolean or fixed vector of booleans.
+  if (CondTy.isScalableVector() ||
+      (CondTy.isFixedVector() &&
+       CondTy.getElementType().getScalarSizeInBits() != 1) ||
+      CondTy.getScalarSizeInBits() != 1)
+    return false;
+
+  // select Cond, Cond, F --> or Cond, F
+  // select Cond, 1, F    --> or Cond, F
+  if ((Cond == True) || isOneOrOneSplat(True, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Ext, Cond);
+      B.buildOr(DstReg, Ext, False, Flags);
+    };
+    return true;
+  }
+
+  // select Cond, T, Cond --> and Cond, T
+  // select Cond, T, 0    --> and Cond, T
+  if ((Cond == False) || isZeroOrZeroSplat(False, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Ext, Cond);
+      B.buildAnd(DstReg, Ext, True);
+    };
+    return true;
+  }
+
+  // select Cond, T, 1 --> or (not Cond), T
+  if (isOneOrOneSplat(False, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      // First the not.
+      Register Inner = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Inner, Cond);
+      // Then an ext to match the destination register.
+      Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Ext, Inner);
+      B.buildOr(DstReg, Ext, True, Flags);
+    };
+    return true;
+  }
+
+  // select Cond, 0, F --> and (not Cond), F
+  if (isZeroOrZeroSplat(True, /* AllowUndefs */ true)) {
+    MatchInfo = [=](MachineIRBuilder &B) {
+      B.setInstrAndDebugLoc(*Select);
+      // First the not.
+      Register Inner = MRI.createGenericVirtualRegister(CondTy);
+      B.buildNot(Inner, Cond);
+      // Then an ext to match the destination register.
+      Register Ext = MRI.createGenericVirtualRegister(TrueTy);
+      B.buildZExtOrTrunc(Ext, Inner);
+      B.buildAnd(DstReg, Ext, False);
+    };
+    return true;
+  }
+
+  return false;
+}
+
+bool CombinerHelper::matchSelect(MachineInstr &MI, BuildFnTy &MatchInfo) {
+  GSelect *Select = cast<GSelect>(&MI);
+
+  if (tryFoldSelectOfConstants(Select, MatchInfo))
+    return true;
+
+  if (tryFoldBoolSelectToLogic(Select, MatchInfo))
+    return true;
+
+  return false;
+}
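As a sanity check on the boolean path (tryFoldBoolSelectToLogic above), the underlying i1 identities can be verified exhaustively. A minimal sketch, not part of the patch, using plain bool in place of s1:

#include <cassert>

int main() {
  for (bool C : {false, true})
    for (bool T : {false, true})
      for (bool F : {false, true}) {
        assert((C ? true : F) == (C | F));   // select C, 1, F --> or C, F
        assert((C ? C : F) == (C | F));      // select C, C, F --> or C, F
        assert((C ? T : false) == (C & T));  // select C, T, 0 --> and C, T
        assert((C ? T : C) == (C & T));      // select C, T, C --> and C, T
        assert((C ? T : true) == (!C | T));  // select C, T, 1 --> or (not C), T
        assert((C ? false : F) == (!C & F)); // select C, 0, F --> and (not C), F
      }
  return 0;
}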
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 81d38a5b080470..be2de620fa456c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -298,3 +298,249 @@ body:             |
     %ext:_(s32) = G_ANYEXT %sel
     $w0 = COPY %ext(s32)
 ...
+---
+# select cond, 1, 0 --> zext(Cond)
+name:            select_cond_1_0_to_zext_cond
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_1_0_to_zext_cond
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %c(s1)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %zero:_(s1) = G_CONSTANT i1 0
+    %one:_(s1) = G_CONSTANT i1 1
+    %sel:_(s1) = G_SELECT %c, %one, %zero
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 0, 1 --> zext(!Cond)
+name:            select_cond_0_1_to_sext_not_cond
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_0_1_to_sext_not_cond
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT [[XOR]](s1)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %zero:_(s1) = G_CONSTANT i1 0
+    %one:_(s1) = G_CONSTANT i1 1
+    %sel:_(s1) = G_SELECT %c, %zero, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 102, 101 --> add (zext Cond), 101
+name:            select_cond_2_1_to_and_zext_cond_false
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_2_1_to_and_zext_cond_false
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 101
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1)
+    ; CHECK-NEXT: %sel:_(s8) = G_ADD [[ZEXT]], %one
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 102
+    %one:_(s8) = G_CONSTANT i8 101
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 101, 102 --> add (sext Cond), 102
+name:            select_cond_1_2_to_and_sext_cond_false
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_1_2_to_and_sext_cond_false
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 102
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT %c(s1)
+    ; CHECK-NEXT: %sel:_(s8) = G_ADD [[SEXT]], %one
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 101
+    %one:_(s8) = G_CONSTANT i8 102
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 64, 0 --> (zext Cond) << log2(64)
+name:            select_cond_64_0_to_shift
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_64_0_to_shift
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT %c(s1)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
+    ; CHECK-NEXT: %sel:_(s8) = G_SHL [[ZEXT]], [[C]](s8)
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 64
+    %one:_(s8) = G_CONSTANT i8 0
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, -1, 0 --> sext Cond
+name:            select_cond_minus_1_0_to_sext_cond
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_minus_1_0_to_sext_cond
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %ext:_(s32) = G_SEXT %c(s1)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 255
+    %one:_(s8) = G_CONSTANT i8 0
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 0, -1 --> sext (!Cond)
+name:            select_cond_0_minus_1_to_sext_not_cond
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_0_minus_1_to_sext_not_cond
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
+    ; CHECK-NEXT: %ext:_(s32) = G_SEXT [[XOR]](s1)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 0
+    %one:_(s8) = G_CONSTANT i8 255
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, -1, 101 --> or (sext Cond), 101
+name:            select_cond_minus_1_101_to_or_sext_cond_101
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_minus_1_101_to_or_sext_cond_101
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %one:_(s8) = G_CONSTANT i8 101
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT %c(s1)
+    ; CHECK-NEXT: %sel:_(s8) = G_OR [[SEXT]], %one
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 255
+    %one:_(s8) = G_CONSTANT i8 101
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
+---
+# select cond, 101, -1 --> or (sext (not Cond)), 101
+name:            select_cond_101_minus_1_to_or_sext_not_cond_101
+body:             |
+  bb.1:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: select_cond_101_minus_1_to_or_sext_not_cond_101
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: %two:_(s8) = G_CONSTANT i8 101
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+    ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s8) = G_SEXT [[XOR]](s1)
+    ; CHECK-NEXT: %sel:_(s8) = G_OR [[SEXT]], %two
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s8)
+    ; CHECK-NEXT: $w0 = COPY %ext(s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %c:_(s1) = G_TRUNC %0
+    %t:_(s1) = G_TRUNC %1
+    %f:_(s1) = G_TRUNC %2
+    %two:_(s8) = G_CONSTANT i8 101
+    %one:_(s8) = G_CONSTANT i8 255
+    %sel:_(s8) = G_SELECT %c, %two, %one
+    %ext:_(s32) = G_ANYEXT %sel
+    $w0 = COPY %ext(s32)
+...
diff --git a/llvm/test/CodeGen/AArch64/andcompare.ll b/llvm/test/CodeGen/AArch64/andcompare.ll
index 9a7fa04982990b..18715a26973397 100644
--- a/llvm/test/CodeGen/AArch64/andcompare.ll
+++ b/llvm/test/CodeGen/AArch64/andcompare.ll
@@ -2414,7 +2414,8 @@ define i32 @cmp_to_ands1(i32 %num) {
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    and w8, w0, #0xff
 ; GISEL-NEXT:    cmp w8, #1
-; GISEL-NEXT:    csel w0, w8, wzr, hi
+; GISEL-NEXT:    cset w9, hi
+; GISEL-NEXT:    and w0, w9, w8
 ; GISEL-NEXT:    ret
   %and = and i32 %num, 255
   %cmp = icmp ugt i32 %and, 1
@@ -2434,7 +2435,8 @@ define i32 @cmp_to_ands2(i32 %num) {
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    and w8, w0, #0xfe
 ; GISEL-NEXT:    cmp w8, #63
-; GISEL-NEXT:    csel w0, w8, wzr, hi
+; GISEL-NEXT:    cset w9, hi
+; GISEL-NEXT:    and w0, w9, w8
 ; GISEL-NEXT:    ret
   %and = and i32 %num, 254
   %cmp = icmp ugt i32 %and, 63
@@ -2451,10 +2453,11 @@ define i32 @cmp_to_ands3(i32 %num, i32 %a) {
 ;
 ; GISEL-LABEL: cmp_to_ands3:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #23
+; GISEL-NEXT:    mov w8, #23 // =0x17
 ; GISEL-NEXT:    and w8, w0, w8
 ; GISEL-NEXT:    cmp w8, #7
-; GISEL-NEXT:    csel w0, w1, wzr, hi
+; GISEL-NEXT:    cset w8, hi
+; GISEL-NEXT:    and w0, w8, w1
 ; GISEL-NEXT:    ret
   %and = and i32 %num, 23
   %cmp = icmp ugt i32 %and, 7
@@ -2514,7 +2517,8 @@ define i32 @cmp_to_ands6(i32 %num) {
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    and w8, w0, #0xfe
 ; GISEL-NEXT:    cmp w8, #16
-; GISEL-NEXT:    csel w0, w8, wzr, hs
+; GISEL-NEXT:    cset w9, hs
+; GISEL-NEXT:    and w0, w9, w8
 ; GISEL-NEXT:    ret
   %and = and i32 %num, 254
   %cmp = icmp uge i32 %and, 16
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index 821f6e403a2713..5b6560084095ba 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -14,7 +14,7 @@ define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB0_2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 5
@@ -42,7 +42,7 @@ define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
 ; SDISEL-NEXT:    bl _foo
 ; SDISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; SDISEL-NEXT:  LBB1_2: ; %if.end
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: single_different:
@@ -55,7 +55,7 @@ define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
 ; GISEL-NEXT:    bl _foo
 ; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; GISEL-NEXT:  LBB1_2: ; %if.end
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 entry:
   %cmp = icmp sle i32 %a, 5
@@ -88,7 +88,7 @@ define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
 ; SDISEL-NEXT:    bl _foo
 ; SDISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; SDISEL-NEXT:  LBB2_3: ; %if.end
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: single_flagclobber:
@@ -106,7 +106,7 @@ define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
 ; GISEL-NEXT:    bl _foo
 ; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; GISEL-NEXT:  LBB2_3: ; %if.end
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 5
@@ -144,7 +144,7 @@ define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB3_3: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 5
@@ -178,13 +178,13 @@ define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
 ; SDISEL-NEXT:    ccmp w8, #16, #0, ge
 ; SDISEL-NEXT:    b.le LBB4_2
 ; SDISEL-NEXT:  ; %bb.1: ; %if.end
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ; SDISEL-NEXT:  LBB4_2: ; %if.then
 ; SDISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; SDISEL-NEXT:    bl _foo
 ; SDISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: speculate_division:
@@ -194,13 +194,13 @@ define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
 ; GISEL-NEXT:    ccmp w8, #17, #0, gt
 ; GISEL-NEXT:    b.lt LBB4_2
 ; GISEL-NEXT:  ; %bb.1: ; %if.end
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  LBB4_2: ; %if.then
 ; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; GISEL-NEXT:    bl _foo
 ; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 entry:
   %cmp = icmp sgt i32 %a, 0
@@ -230,13 +230,13 @@ define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
 ; SDISEL-NEXT:    fccmp s0, s1, #8, ge
 ; SDISEL-NEXT:    b.ge LBB5_2
 ; SDISEL-NEXT:  ; %bb.1: ; %if.end
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ; SDISEL-NEXT:  LBB5_2: ; %if.then
 ; SDISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; SDISEL-NEXT:    bl _foo
 ; SDISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT:    mov w0, #7
+; SDISEL-NEXT:    mov w0, #7 ; =0x7
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: single_fcmp:
@@ -248,13 +248,13 @@ define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
 ; GISEL-NEXT:    fccmp s0, s1, #8, gt
 ; GISEL-NEXT:    b.ge LBB5_2
 ; GISEL-NEXT:  ; %bb.1: ; %if.end
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  LBB5_2: ; %if.then
 ; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; GISEL-NEXT:    bl _foo
 ; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT:    mov w0, #7
+; GISEL-NEXT:    mov w0, #7 ; =0x7
 ; GISEL-NEXT:    ret
 entry:
   %cmp = icmp sgt i32 %a, 0
@@ -318,7 +318,7 @@ define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB7_2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 0
@@ -346,13 +346,13 @@ define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    cmp w1, #32
 ; CHECK-NEXT:    b.eq LBB8_3
 ; CHECK-NEXT:  ; %bb.2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  LBB8_3: ; %if.then
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 5
@@ -380,7 +380,7 @@ define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB9_2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 0
@@ -408,7 +408,7 @@ define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB10_2: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 0
@@ -466,7 +466,7 @@ define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) {
 ;
 ; GISEL-LABEL: select_and:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #5
+; GISEL-NEXT:    mov w8, #5 ; =0x5
 ; GISEL-NEXT:    cmp w8, w1
 ; GISEL-NEXT:    ccmp w0, w1, #0, ne
 ; GISEL-NEXT:    csel x0, x2, x3, lt
@@ -488,7 +488,7 @@ define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) {
 ;
 ; GISEL-LABEL: select_or:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #5
+; GISEL-NEXT:    mov w8, #5 ; =0x5
 ; GISEL-NEXT:    cmp w8, w1
 ; GISEL-NEXT:    ccmp w0, w1, #8, eq
 ; GISEL-NEXT:    csel x0, x2, x3, lt
@@ -510,7 +510,7 @@ define float @select_or_float(i32 %w0, i32 %w1, float %x2, float %x3) {
 ;
 ; GISEL-LABEL: select_or_float:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #5
+; GISEL-NEXT:    mov w8, #5 ; =0x5
 ; GISEL-NEXT:    cmp w8, w1
 ; GISEL-NEXT:    ccmp w0, w1, #8, eq
 ; GISEL-NEXT:    fcsel s0, s0, s1, lt
@@ -528,17 +528,22 @@ define i64 @gccbug(i64 %x0, i64 %x1) {
 ; SDISEL-NEXT:    cmp x0, #2
 ; SDISEL-NEXT:    ccmp x0, #4, #4, ne
 ; SDISEL-NEXT:    ccmp x1, #0, #0, eq
-; SDISEL-NEXT:    mov w8, #1
+; SDISEL-NEXT:    mov w8, #1 ; =0x1
 ; SDISEL-NEXT:    cinc x0, x8, eq
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: gccbug:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #2
+; GISEL-NEXT:    cmp x1, #0
+; GISEL-NEXT:    cset w8, eq
 ; GISEL-NEXT:    cmp x0, #2
-; GISEL-NEXT:    ccmp x0, #4, #4, ne
-; GISEL-NEXT:    ccmp x1, #0, #0, eq
-; GISEL-NEXT:    csinc x0, x8, xzr, eq
+; GISEL-NEXT:    cset w9, eq
+; GISEL-NEXT:    cmp x0, #4
+; GISEL-NEXT:    cset w10, eq
+; GISEL-NEXT:    orr w9, w10, w9
+; GISEL-NEXT:    and w8, w9, w8
+; GISEL-NEXT:    and x8, x8, #0x1
+; GISEL-NEXT:    add x0, x8, #1
 ; GISEL-NEXT:    ret
   %cmp0 = icmp eq i64 %x1, 0
   %cmp1 = icmp eq i64 %x0, 2
@@ -552,14 +557,30 @@ define i64 @gccbug(i64 %x0, i64 %x1) {
 }
 
 define i32 @select_ororand(i32 %w0, i32 %w1, i32 %w2, i32 %w3) {
-; CHECK-LABEL: select_ororand:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    cmp w3, #4
-; CHECK-NEXT:    ccmp w2, #2, #0, gt
-; CHECK-NEXT:    ccmp w1, #13, #2, ge
-; CHECK-NEXT:    ccmp w0, #0, #4, ls
-; CHECK-NEXT:    csel w0, w3, wzr, eq
-; CHECK-NEXT:    ret
+; SDISEL-LABEL: select_ororand:
+; SDISEL:       ; %bb.0:
+; SDISEL-NEXT:    cmp w3, #4
+; SDISEL-NEXT:    ccmp w2, #2, #0, gt
+; SDISEL-NEXT:    ccmp w1, #13, #2, ge
+; SDISEL-NEXT:    ccmp w0, #0, #4, ls
+; SDISEL-NEXT:    csel w0, w3, wzr, eq
+; SDISEL-NEXT:    ret
+;
+; GISEL-LABEL: select_ororand:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    cmp w0, #0
+; GISEL-NEXT:    cset w8, eq
+; GISEL-NEXT:    cmp w1, #13
+; GISEL-NEXT:    cset w9, hi
+; GISEL-NEXT:    cmp w2, #2
+; GISEL-NEXT:    cset w10, lt
+; GISEL-NEXT:    cmp w3, #4
+; GISEL-NEXT:    cset w11, gt
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    and w0, w8, w3
+; GISEL-NEXT:    ret
   %c0 = icmp eq i32 %w0, 0
   %c1 = icmp ugt i32 %w1, 13
   %c2 = icmp slt i32 %w2, 2
@@ -592,7 +613,7 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) {
 ; SDISEL-LABEL: select_andor32:
 ; SDISEL:       ; %bb.0:
 ; SDISEL-NEXT:    cmp w1, w2
-; SDISEL-NEXT:    mov w8, #32
+; SDISEL-NEXT:    mov w8, #32 ; =0x20
 ; SDISEL-NEXT:    ccmp w0, w8, #4, lt
 ; SDISEL-NEXT:    ccmp w0, w1, #0, eq
 ; SDISEL-NEXT:    csel w0, w0, w1, eq
@@ -600,7 +621,7 @@ define i32 @select_andor32(i32 %v1, i32 %v2, i32 %v3) {
 ;
 ; GISEL-LABEL: select_andor32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov w8, #32
+; GISEL-NEXT:    mov w8, #32 ; =0x20
 ; GISEL-NEXT:    cmp w1, w2
 ; GISEL-NEXT:    ccmp w0, w8, #4, lt
 ; GISEL-NEXT:    ccmp w0, w1, #0, eq
@@ -631,18 +652,18 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
 ; GISEL-LABEL: select_noccmp1:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    cmp x0, #0
-; GISEL-NEXT:    cset w8, lt
+; GISEL-NEXT:    cset w8, ge
 ; GISEL-NEXT:    cmp x0, #13
-; GISEL-NEXT:    cset w9, gt
+; GISEL-NEXT:    cset w9, le
 ; GISEL-NEXT:    cmp x2, #2
-; GISEL-NEXT:    cset w10, lt
+; GISEL-NEXT:    cset w10, ge
 ; GISEL-NEXT:    cmp x2, #4
-; GISEL-NEXT:    cset w11, gt
-; GISEL-NEXT:    and w8, w8, w9
-; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    cset w11, le
 ; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    tst w8, #0x1
-; GISEL-NEXT:    csel x0, xzr, x3, ne
+; GISEL-NEXT:    orr w9, w10, w11
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    and x8, x8, #0x1
+; GISEL-NEXT:    and x0, x8, x3
 ; GISEL-NEXT:    ret
   %c0 = icmp slt i64 %v1, 0
   %c1 = icmp sgt i64 %v1, 13
@@ -677,11 +698,12 @@ define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
 ; GISEL-NEXT:    cmp x0, #13
 ; GISEL-NEXT:    cset w9, gt
 ; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    tst w8, #0x1
-; GISEL-NEXT:    csel x0, xzr, x3, ne
+; GISEL-NEXT:    eor w9, w8, #0x1
+; GISEL-NEXT:    and x9, x9, #0x1
 ; GISEL-NEXT:    sbfx w8, w8, #0, #1
-; GISEL-NEXT:    adrp x9, _g at PAGE
-; GISEL-NEXT:    str w8, [x9, _g at PAGEOFF]
+; GISEL-NEXT:    adrp x10, _g at PAGE
+; GISEL-NEXT:    str w8, [x10, _g at PAGEOFF]
+; GISEL-NEXT:    and x0, x9, x3
 ; GISEL-NEXT:    ret
   %c0 = icmp slt i64 %v1, 0
   %c1 = icmp sgt i64 %v1, 13
@@ -701,11 +723,11 @@ define i32 @select_noccmp3(i32 %v0, i32 %v1, i32 %v2) {
 ; SDISEL-NEXT:    ccmp w0, #13, #0, ge
 ; SDISEL-NEXT:    cset w8, gt
 ; SDISEL-NEXT:    cmp w0, #22
-; SDISEL-NEXT:    mov w9, #44
+; SDISEL-NEXT:    mov w9, #44 ; =0x2c
 ; SDISEL-NEXT:    ccmp w0, w9, #0, ge
 ; SDISEL-NEXT:    csel w8, wzr, w8, le
 ; SDISEL-NEXT:    cmp w0, #99
-; SDISEL-NEXT:    mov w9, #77
+; SDISEL-NEXT:    mov w9, #77 ; =0x4d
 ; SDISEL-NEXT:    ccmp w0, w9, #4, ne
 ; SDISEL-NEXT:    cset w9, eq
 ; SDISEL-NEXT:    tst w8, w9
diff --git a/llvm/test/CodeGen/AArch64/call-rv-marker.ll b/llvm/test/CodeGen/AArch64/call-rv-marker.ll
index fc06809ad09fb6..de8f5bbfb484d6 100644
--- a/llvm/test/CodeGen/AArch64/call-rv-marker.ll
+++ b/llvm/test/CodeGen/AArch64/call-rv-marker.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -o - %s | FileCheck --check-prefix=SELDAG --check-prefix=CHECK %s
 ; RUN: llc -global-isel -o - %s | FileCheck --check-prefix=GISEL --check-prefix=CHECK %s
 
@@ -25,37 +26,93 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
 @fptr = dso_local global ptr null, align 8
 
 define dso_local ptr @rv_marker_1_retain() {
-; CHECK-LABEL: _rv_marker_1_retain:
-; CHECK:         bl _foo1
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_1_retain:
+; SELDAG:       ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 16
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    bl _foo1
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT:    ret
 ;
+; GISEL-LABEL: rv_marker_1_retain:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    bl _foo1
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT:    ret
 entry:
   %call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
   ret ptr %call
 }
 
 define dso_local ptr @rv_marker_1_unsafeClaim() {
-; CHECK-LABEL: _rv_marker_1_unsafeClaim:
-; CHECK:         bl _foo1
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_unsafeClaimAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_1_unsafeClaim:
+; SELDAG:       ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 16
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    bl _foo1
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_unsafeClaimAutoreleasedReturnValue
+; SELDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT:    ret
 ;
+; GISEL-LABEL: rv_marker_1_unsafeClaim:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    bl _foo1
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_unsafeClaimAutoreleasedReturnValue
+; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT:    ret
 entry:
   %call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_unsafeClaimAutoreleasedReturnValue) ]
   ret ptr %call
 }
 
 define dso_local void @rv_marker_2_select(i32 %c) {
-; CHECK-LABEL: _rv_marker_2_select:
-; SELDAG:        cinc  w0, w8, eq
-; GISEL:         csinc w0, w8, wzr, eq
-; CHECK-NEXT:    bl _foo0
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
-; CHECK-NEXT:    ldp x29, x30, [sp], #16
-; CHECK-NEXT:    b _foo2
+; SELDAG-LABEL: rv_marker_2_select:
+; SELDAG:       ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 16
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    mov w8, #1 ; =0x1
+; SELDAG-NEXT:    cmp w0, #0
+; SELDAG-NEXT:    cinc w0, w8, eq
+; SELDAG-NEXT:    bl _foo0
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT:    b _foo2
 ;
+; GISEL-LABEL: rv_marker_2_select:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    mov w8, #1 ; =0x1
+; GISEL-NEXT:    cmp w0, #0
+; GISEL-NEXT:    cinc w0, w8, eq
+; GISEL-NEXT:    bl _foo0
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT:    b _foo2
 entry:
   %tobool.not = icmp eq i32 %c, 0
   %.sink = select i1 %tobool.not, i32 2, i32 1
@@ -65,11 +122,121 @@ entry:
 }
 
 define dso_local void @rv_marker_3() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: _rv_marker_3:
-; CHECK:         bl _foo1
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_3:
+; SELDAG:       Lfunc_begin0:
+; SELDAG-NEXT:    .cfi_startproc
+; SELDAG-NEXT:    .cfi_personality 155, ___gxx_personality_v0
+; SELDAG-NEXT:    .cfi_lsda 16, Lexception0
+; SELDAG-NEXT:  ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 32
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    .cfi_offset w19, -24
+; SELDAG-NEXT:    .cfi_offset w20, -32
+; SELDAG-NEXT:    bl _foo1
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    mov x19, x0
+; SELDAG-NEXT:  Ltmp0:
+; SELDAG-NEXT:    bl _objc_object
+; SELDAG-NEXT:  Ltmp1:
+; SELDAG-NEXT:  ; %bb.1: ; %invoke.cont
+; SELDAG-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; SELDAG-NEXT:    b _objc_release
+; SELDAG-NEXT:  LBB3_2: ; %lpad
+; SELDAG-NEXT:  Ltmp2:
+; SELDAG-NEXT:    mov x20, x0
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    bl _objc_release
+; SELDAG-NEXT:    mov x0, x20
+; SELDAG-NEXT:    bl __Unwind_Resume
+; SELDAG-NEXT:  Lfunc_end0:
+; SELDAG-NEXT:    .cfi_endproc
+; SELDAG-NEXT:    .section __TEXT,__gcc_except_tab
+; SELDAG-NEXT:    .p2align 2, 0x0
+; SELDAG-NEXT:  GCC_except_table3:
+; SELDAG-NEXT:  Lexception0:
+; SELDAG-NEXT:    .byte 255 ; @LPStart Encoding = omit
+; SELDAG-NEXT:    .byte 255 ; @TType Encoding = omit
+; SELDAG-NEXT:    .byte 1 ; Call site Encoding = uleb128
+; SELDAG-NEXT:    .uleb128 Lcst_end0-Lcst_begin0
+; SELDAG-NEXT:  Lcst_begin0:
+; SELDAG-NEXT:    .uleb128 Lfunc_begin0-Lfunc_begin0 ; >> Call Site 1 <<
+; SELDAG-NEXT:    .uleb128 Ltmp0-Lfunc_begin0 ; Call between Lfunc_begin0 and Ltmp0
+; SELDAG-NEXT:    .byte 0 ; has no landing pad
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:    .uleb128 Ltmp0-Lfunc_begin0 ; >> Call Site 2 <<
+; SELDAG-NEXT:    .uleb128 Ltmp1-Ltmp0 ; Call between Ltmp0 and Ltmp1
+; SELDAG-NEXT:    .uleb128 Ltmp2-Lfunc_begin0 ; jumps to Ltmp2
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:    .uleb128 Ltmp1-Lfunc_begin0 ; >> Call Site 3 <<
+; SELDAG-NEXT:    .uleb128 Lfunc_end0-Ltmp1 ; Call between Ltmp1 and Lfunc_end0
+; SELDAG-NEXT:    .byte 0 ; has no landing pad
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:  Lcst_end0:
+; SELDAG-NEXT:    .p2align 2, 0x0
 ;
+; GISEL-LABEL: rv_marker_3:
+; GISEL:       Lfunc_begin0:
+; GISEL-NEXT:    .cfi_startproc
+; GISEL-NEXT:    .cfi_personality 155, ___gxx_personality_v0
+; GISEL-NEXT:    .cfi_lsda 16, Lexception0
+; GISEL-NEXT:  ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 32
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    .cfi_offset w19, -24
+; GISEL-NEXT:    .cfi_offset w20, -32
+; GISEL-NEXT:    bl _foo1
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    mov x19, x0
+; GISEL-NEXT:  Ltmp0:
+; GISEL-NEXT:    bl _objc_object
+; GISEL-NEXT:  Ltmp1:
+; GISEL-NEXT:  ; %bb.1: ; %invoke.cont
+; GISEL-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; GISEL-NEXT:    b _objc_release
+; GISEL-NEXT:  LBB3_2: ; %lpad
+; GISEL-NEXT:  Ltmp2:
+; GISEL-NEXT:    mov x20, x0
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    bl _objc_release
+; GISEL-NEXT:    mov x0, x20
+; GISEL-NEXT:    bl __Unwind_Resume
+; GISEL-NEXT:  Lfunc_end0:
+; GISEL-NEXT:    .cfi_endproc
+; GISEL-NEXT:    .section __TEXT,__gcc_except_tab
+; GISEL-NEXT:    .p2align 2, 0x0
+; GISEL-NEXT:  GCC_except_table3:
+; GISEL-NEXT:  Lexception0:
+; GISEL-NEXT:    .byte 255 ; @LPStart Encoding = omit
+; GISEL-NEXT:    .byte 255 ; @TType Encoding = omit
+; GISEL-NEXT:    .byte 1 ; Call site Encoding = uleb128
+; GISEL-NEXT:    .uleb128 Lcst_end0-Lcst_begin0
+; GISEL-NEXT:  Lcst_begin0:
+; GISEL-NEXT:    .uleb128 Lfunc_begin0-Lfunc_begin0 ; >> Call Site 1 <<
+; GISEL-NEXT:    .uleb128 Ltmp0-Lfunc_begin0 ; Call between Lfunc_begin0 and Ltmp0
+; GISEL-NEXT:    .byte 0 ; has no landing pad
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:    .uleb128 Ltmp0-Lfunc_begin0 ; >> Call Site 2 <<
+; GISEL-NEXT:    .uleb128 Ltmp1-Ltmp0 ; Call between Ltmp0 and Ltmp1
+; GISEL-NEXT:    .uleb128 Ltmp2-Lfunc_begin0 ; jumps to Ltmp2
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:    .uleb128 Ltmp1-Lfunc_begin0 ; >> Call Site 3 <<
+; GISEL-NEXT:    .uleb128 Lfunc_end0-Ltmp1 ; Call between Ltmp1 and Lfunc_end0
+; GISEL-NEXT:    .byte 0 ; has no landing pad
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:  Lcst_end0:
+; GISEL-NEXT:    .p2align 2, 0x0
 entry:
   %call = call ptr @foo1() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
   invoke void @objc_object(ptr %call) #5
@@ -87,13 +254,151 @@ lpad:                                             ; preds = %entry
 }
 
 define dso_local void @rv_marker_4() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: _rv_marker_4:
-; CHECK:       Ltmp3:
-; CHECK-NEXT:    bl _foo1
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
-; CHECK-NEXT:  Ltmp4:
+; SELDAG-LABEL: rv_marker_4:
+; SELDAG:       Lfunc_begin1:
+; SELDAG-NEXT:    .cfi_startproc
+; SELDAG-NEXT:    .cfi_personality 155, ___gxx_personality_v0
+; SELDAG-NEXT:    .cfi_lsda 16, Lexception1
+; SELDAG-NEXT:  ; %bb.0: ; %entry
+; SELDAG-NEXT:    sub sp, sp, #48
+; SELDAG-NEXT:    stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; SELDAG-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 48
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    .cfi_offset w19, -24
+; SELDAG-NEXT:    .cfi_offset w20, -32
+; SELDAG-NEXT:  Ltmp3:
+; SELDAG-NEXT:    bl _foo1
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:  Ltmp4:
+; SELDAG-NEXT:  ; %bb.1: ; %invoke.cont
+; SELDAG-NEXT:  Ltmp6:
+; SELDAG-NEXT:    mov x19, x0
+; SELDAG-NEXT:    bl _objc_object
+; SELDAG-NEXT:  Ltmp7:
+; SELDAG-NEXT:  ; %bb.2: ; %invoke.cont2
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    bl _objc_release
+; SELDAG-NEXT:    add x0, sp, #15
+; SELDAG-NEXT:    bl __ZN1SD1Ev
+; SELDAG-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; SELDAG-NEXT:    ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; SELDAG-NEXT:    add sp, sp, #48
+; SELDAG-NEXT:    ret
+; SELDAG-NEXT:  LBB4_3: ; %lpad1
+; SELDAG-NEXT:  Ltmp8:
+; SELDAG-NEXT:    mov x20, x0
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    bl _objc_release
+; SELDAG-NEXT:    b LBB4_5
+; SELDAG-NEXT:  LBB4_4: ; %lpad
+; SELDAG-NEXT:  Ltmp5:
+; SELDAG-NEXT:    mov x20, x0
+; SELDAG-NEXT:  LBB4_5: ; %ehcleanup
+; SELDAG-NEXT:    add x0, sp, #15
+; SELDAG-NEXT:    bl __ZN1SD1Ev
+; SELDAG-NEXT:    mov x0, x20
+; SELDAG-NEXT:    bl __Unwind_Resume
+; SELDAG-NEXT:  Lfunc_end1:
+; SELDAG-NEXT:    .cfi_endproc
+; SELDAG-NEXT:    .section __TEXT,__gcc_except_tab
+; SELDAG-NEXT:    .p2align 2, 0x0
+; SELDAG-NEXT:  GCC_except_table4:
+; SELDAG-NEXT:  Lexception1:
+; SELDAG-NEXT:    .byte 255 ; @LPStart Encoding = omit
+; SELDAG-NEXT:    .byte 255 ; @TType Encoding = omit
+; SELDAG-NEXT:    .byte 1 ; Call site Encoding = uleb128
+; SELDAG-NEXT:    .uleb128 Lcst_end1-Lcst_begin1
+; SELDAG-NEXT:  Lcst_begin1:
+; SELDAG-NEXT:    .uleb128 Ltmp3-Lfunc_begin1 ; >> Call Site 1 <<
+; SELDAG-NEXT:    .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4
+; SELDAG-NEXT:    .uleb128 Ltmp5-Lfunc_begin1 ; jumps to Ltmp5
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:    .uleb128 Ltmp6-Lfunc_begin1 ; >> Call Site 2 <<
+; SELDAG-NEXT:    .uleb128 Ltmp7-Ltmp6 ; Call between Ltmp6 and Ltmp7
+; SELDAG-NEXT:    .uleb128 Ltmp8-Lfunc_begin1 ; jumps to Ltmp8
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:    .uleb128 Ltmp7-Lfunc_begin1 ; >> Call Site 3 <<
+; SELDAG-NEXT:    .uleb128 Lfunc_end1-Ltmp7 ; Call between Ltmp7 and Lfunc_end1
+; SELDAG-NEXT:    .byte 0 ; has no landing pad
+; SELDAG-NEXT:    .byte 0 ; On action: cleanup
+; SELDAG-NEXT:  Lcst_end1:
+; SELDAG-NEXT:    .p2align 2, 0x0
 ;
+; GISEL-LABEL: rv_marker_4:
+; GISEL:       Lfunc_begin1:
+; GISEL-NEXT:    .cfi_startproc
+; GISEL-NEXT:    .cfi_personality 155, ___gxx_personality_v0
+; GISEL-NEXT:    .cfi_lsda 16, Lexception1
+; GISEL-NEXT:  ; %bb.0: ; %entry
+; GISEL-NEXT:    sub sp, sp, #48
+; GISEL-NEXT:    stp x20, x19, [sp, #16] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #32] ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 48
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    .cfi_offset w19, -24
+; GISEL-NEXT:    .cfi_offset w20, -32
+; GISEL-NEXT:  Ltmp3:
+; GISEL-NEXT:    bl _foo1
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:  Ltmp4:
+; GISEL-NEXT:  ; %bb.1: ; %invoke.cont
+; GISEL-NEXT:  Ltmp6:
+; GISEL-NEXT:    mov x19, x0
+; GISEL-NEXT:    bl _objc_object
+; GISEL-NEXT:  Ltmp7:
+; GISEL-NEXT:  ; %bb.2: ; %invoke.cont2
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    bl _objc_release
+; GISEL-NEXT:    add x0, sp, #15
+; GISEL-NEXT:    bl __ZN1SD1Ev
+; GISEL-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT:    add sp, sp, #48
+; GISEL-NEXT:    ret
+; GISEL-NEXT:  LBB4_3: ; %lpad1
+; GISEL-NEXT:  Ltmp8:
+; GISEL-NEXT:    mov x20, x0
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    bl _objc_release
+; GISEL-NEXT:    b LBB4_5
+; GISEL-NEXT:  LBB4_4: ; %lpad
+; GISEL-NEXT:  Ltmp5:
+; GISEL-NEXT:    mov x20, x0
+; GISEL-NEXT:  LBB4_5: ; %ehcleanup
+; GISEL-NEXT:    add x0, sp, #15
+; GISEL-NEXT:    bl __ZN1SD1Ev
+; GISEL-NEXT:    mov x0, x20
+; GISEL-NEXT:    bl __Unwind_Resume
+; GISEL-NEXT:  Lfunc_end1:
+; GISEL-NEXT:    .cfi_endproc
+; GISEL-NEXT:    .section __TEXT,__gcc_except_tab
+; GISEL-NEXT:    .p2align 2, 0x0
+; GISEL-NEXT:  GCC_except_table4:
+; GISEL-NEXT:  Lexception1:
+; GISEL-NEXT:    .byte 255 ; @LPStart Encoding = omit
+; GISEL-NEXT:    .byte 255 ; @TType Encoding = omit
+; GISEL-NEXT:    .byte 1 ; Call site Encoding = uleb128
+; GISEL-NEXT:    .uleb128 Lcst_end1-Lcst_begin1
+; GISEL-NEXT:  Lcst_begin1:
+; GISEL-NEXT:    .uleb128 Ltmp3-Lfunc_begin1 ; >> Call Site 1 <<
+; GISEL-NEXT:    .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4
+; GISEL-NEXT:    .uleb128 Ltmp5-Lfunc_begin1 ; jumps to Ltmp5
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:    .uleb128 Ltmp6-Lfunc_begin1 ; >> Call Site 2 <<
+; GISEL-NEXT:    .uleb128 Ltmp7-Ltmp6 ; Call between Ltmp6 and Ltmp7
+; GISEL-NEXT:    .uleb128 Ltmp8-Lfunc_begin1 ; jumps to Ltmp8
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:    .uleb128 Ltmp7-Lfunc_begin1 ; >> Call Site 3 <<
+; GISEL-NEXT:    .uleb128 Lfunc_end1-Ltmp7 ; Call between Ltmp7 and Lfunc_end1
+; GISEL-NEXT:    .byte 0 ; has no landing pad
+; GISEL-NEXT:    .byte 0 ; On action: cleanup
+; GISEL-NEXT:  Lcst_end1:
+; GISEL-NEXT:    .p2align 2, 0x0
 entry:
   %s = alloca %struct.S, align 1
   call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %s) #2
@@ -129,11 +434,53 @@ ehcleanup:                                        ; preds = %lpad1, %lpad
 }
 
 define dso_local ptr @rv_marker_5_indirect_call() {
-; CHECK-LABEL: _rv_marker_5_indirect_call:
-; CHECK:         ldr [[ADDR:x[0-9]+]], [
-; CHECK-NEXT:    blr [[ADDR]]
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_5_indirect_call:
+; SELDAG:       ; %bb.0: ; %entry
+; SELDAG-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 32
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    .cfi_offset w19, -24
+; SELDAG-NEXT:    .cfi_offset w20, -32
+; SELDAG-NEXT:  Lloh0:
+; SELDAG-NEXT:    adrp x8, _fptr@PAGE
+; SELDAG-NEXT:  Lloh1:
+; SELDAG-NEXT:    ldr x8, [x8, _fptr@PAGEOFF]
+; SELDAG-NEXT:    blr x8
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    mov x19, x0
+; SELDAG-NEXT:    bl _foo2
+; SELDAG-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; SELDAG-NEXT:    mov x0, x19
+; SELDAG-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; SELDAG-NEXT:    ret
+; SELDAG-NEXT:    .loh AdrpLdr Lloh0, Lloh1
+;
+; GISEL-LABEL: rv_marker_5_indirect_call:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 32
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    .cfi_offset w19, -24
+; GISEL-NEXT:    .cfi_offset w20, -32
+; GISEL-NEXT:  Lloh0:
+; GISEL-NEXT:    adrp x8, _fptr@PAGE
+; GISEL-NEXT:  Lloh1:
+; GISEL-NEXT:    ldr x8, [x8, _fptr@PAGEOFF]
+; GISEL-NEXT:    blr x8
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    mov x19, x0
+; GISEL-NEXT:    bl _foo2
+; GISEL-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; GISEL-NEXT:    mov x0, x19
+; GISEL-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; GISEL-NEXT:    ret
+; GISEL-NEXT:    .loh AdrpLdr Lloh0, Lloh1
 entry:
   %0 = load ptr, ptr @fptr, align 8
   %call = call ptr %0() [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
@@ -144,13 +491,35 @@ entry:
 declare ptr @foo(i64, i64, i64)
 
 define dso_local void @rv_marker_multiarg(i64 %a, i64 %b, i64 %c) {
-; CHECK-LABEL: _rv_marker_multiarg:
-; CHECK:         mov [[TMP:x[0-9]+]], x0
-; CHECK-NEXT:    mov x0, x2
-; CHECK-NEXT:    mov x2, [[TMP]]
-; CHECK-NEXT:    bl  _foo
-; CHECK-NEXT:    mov x29, x29
-; CHECK-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-LABEL: rv_marker_multiarg:
+; SELDAG:       ; %bb.0:
+; SELDAG-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SELDAG-NEXT:    .cfi_def_cfa_offset 16
+; SELDAG-NEXT:    .cfi_offset w30, -8
+; SELDAG-NEXT:    .cfi_offset w29, -16
+; SELDAG-NEXT:    mov x8, x0
+; SELDAG-NEXT:    mov x0, x2
+; SELDAG-NEXT:    mov x2, x8
+; SELDAG-NEXT:    bl _foo
+; SELDAG-NEXT:    mov x29, x29
+; SELDAG-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; SELDAG-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; SELDAG-NEXT:    ret
+;
+; GISEL-LABEL: rv_marker_multiarg:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -8
+; GISEL-NEXT:    .cfi_offset w29, -16
+; GISEL-NEXT:    mov x3, x0
+; GISEL-NEXT:    mov x0, x2
+; GISEL-NEXT:    mov x2, x3
+; GISEL-NEXT:    bl _foo
+; GISEL-NEXT:    mov x29, x29
+; GISEL-NEXT:    bl _objc_retainAutoreleasedReturnValue
+; GISEL-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; GISEL-NEXT:    ret
   call ptr @foo(i64 %c, i64 %b, i64 %a) [ "clang.arc.attachedcall"(ptr @objc_retainAutoreleasedReturnValue) ]
   ret void
 }
@@ -158,3 +527,5 @@ define dso_local void @rv_marker_multiarg(i64 %a, i64 %b, i64 %c) {
 declare ptr @objc_retainAutoreleasedReturnValue(ptr)
 declare ptr @objc_unsafeClaimAutoreleasedReturnValue(ptr)
 declare i32 @__gxx_personality_v0(...)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index f17b9724aadba3..0d02cc170a98b7 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1100,9 +1100,7 @@ define <8 x i8> @vselect_constant_cond_zero_v8i8(<8 x i8> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI83_0
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI83_0]
-; CHECK-GI-NEXT:    shl v1.8b, v1.8b, #7
-; CHECK-GI-NEXT:    sshr v1.8b, v1.8b, #7
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    ret
   %b = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i8> %a, <8 x i8> zeroinitializer
   ret <8 x i8> %b
@@ -1119,9 +1117,7 @@ define <4 x i16> @vselect_constant_cond_zero_v4i16(<4 x i16> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI84_0
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI84_0]
-; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #15
-; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #15
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    ret
   %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> %a, <4 x i16> zeroinitializer
   ret <4 x i16> %b
@@ -1139,12 +1135,12 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI85_1
 ; CHECK-GI-NEXT:    adrp x9, .LCPI85_0
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI85_1]
-; CHECK-GI-NEXT:    ldr d2, [x9, :lo12:.LCPI85_0]
-; CHECK-GI-NEXT:    mov v1.d[1], v2.d[0]
-; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #31
-; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #31
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI85_1]
+; CHECK-GI-NEXT:    ldr d3, [x9, :lo12:.LCPI85_0]
+; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    ret
   %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> %a, <4 x i32> zeroinitializer
   ret <4 x i32> %b
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
index 9f3ad8b4444462..d60dbe3c8436e8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
@@ -450,8 +450,9 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %and:_(s32) = G_SELECT %cond(s1), %zero, %variable
+    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(ne), %reg(s32), %zero
+    ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -476,7 +477,8 @@ body: |
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %and:_(s32) = G_SELECT %cond(s1), %variable, %zero
+    ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -500,9 +502,9 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
-    ; CHECK-NEXT: %or:_(s32) = G_SELECT %cond(s1), %variable, %neg1
+    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(ne), %reg(s32), %zero
+    ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -527,8 +529,8 @@ body: |
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
-    ; CHECK-NEXT: %or:_(s32) = G_SELECT %cond(s1), %neg1, %variable
+    ; CHECK-NEXT: %select:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -549,15 +551,8 @@ body: |
     ; CHECK-LABEL: name: fold_and_variable_into_select_undef_neg1_s32
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
-    ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
-    ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
-    ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %undef, %neg1
-    ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
-    ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit %variable(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
     %zero:_(s32) = G_CONSTANT i32 0
@@ -582,9 +577,8 @@ body: |
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %undef, %zero
-    ; CHECK-NEXT: %and:_(s32) = G_AND %select, %variable
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT %cond(s1)
+    ; CHECK-NEXT: %and:_(s32) = G_AND [[ZEXT]], %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %and(s32)
     %reg:_(s32) = COPY $vgpr0
     %variable:_(s32) = COPY $vgpr0
@@ -667,9 +661,9 @@ body: |
     ; CHECK-NEXT: %variable:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %neg1:_(s32) = G_CONSTANT i32 -1
     ; CHECK-NEXT: %otherconst:_(s32) = G_CONSTANT i32 123
-    ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %neg1, %otherconst
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT %cond(s1)
+    ; CHECK-NEXT: %select:_(s32) = G_OR [[SEXT]], %otherconst
     ; CHECK-NEXT: %or:_(s32) = G_OR %select, %variable
     ; CHECK-NEXT: S_ENDPGM 0, implicit %or(s32)
     %reg:_(s32) = COPY $vgpr0
@@ -749,8 +743,7 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: %srem:_(s32) = G_SELECT %cond(s1), [[C]], %zero
+    ; CHECK-NEXT: %srem:_(s32) = G_ZEXT %cond(s1)
     ; CHECK-NEXT: S_ENDPGM 0, implicit %srem(s32)
     %reg:_(s32) = COPY $vgpr0
     %zero:_(s32) = G_CONSTANT i32 0
@@ -802,8 +795,7 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK-NEXT: %udiv:_(s32) = G_SELECT %cond(s1), [[C]], %zero
+    ; CHECK-NEXT: %udiv:_(s32) = G_ZEXT %cond(s1)
     ; CHECK-NEXT: S_ENDPGM 0, implicit %udiv(s32)
     %reg:_(s32) = COPY $vgpr0
     %zero:_(s32) = G_CONSTANT i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index ccf4e84fbbbd16..4ac1fad6deecdc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -37,7 +37,8 @@ define amdgpu_ps void @divergent_i1_phi_uniform_branch(ptr addrspace(1) %out, i3
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 1, v2
 ; GFX10-NEXT:    global_store_dword v[3:4], v5, off
 ; GFX10-NEXT:  .LBB0_3: ; %exit
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 A:
@@ -72,7 +73,8 @@ define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(ptr addrspace(1) %
 ; GFX10-NEXT:  .LBB1_2: ; %B
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 1, v2
 ; GFX10-NEXT:  .LBB1_3: ; %exit
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 A:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index afd271c9957700..c1f3924e466d57 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -14,7 +14,8 @@ define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid,
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 1, v2
 ; GFX10-NEXT:  ; %bb.2: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 A:
@@ -51,7 +52,8 @@ define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid,
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 1, v2
 ; GFX10-NEXT:  ; %bb.4: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s0
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index f9b98059be0b3a..d23f33c90c1d68 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -5758,42 +5758,43 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX6-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[14:15], s[0:1], s10
-; GFX6-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[0:1], s10
+; GFX6-NEXT:    s_lshr_b64 s[18:19], s[0:1], s11
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
-; GFX6-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX6-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
+; GFX6-NEXT:    s_bfe_u64 s[18:19], s[14:15], 0x10000
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX6-NEXT:    s_cselect_b64 s[14:15], s[14:15], 0
+; GFX6-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX6-NEXT:    s_mov_b32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX6-NEXT:    s_lshl_b32 s13, s6, 31
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX6-NEXT:    s_sub_i32 s12, s8, 64
-; GFX6-NEXT:    s_sub_i32 s10, 64, s8
+; GFX6-NEXT:    s_sub_i32 s7, s8, 64
+; GFX6-NEXT:    s_sub_i32 s12, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
+; GFX6-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s8
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
-; GFX6-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX6-NEXT:    s_lshl_b64 s[12:13], s[4:5], s12
+; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
-; GFX6-NEXT:    s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; GFX6-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -5804,42 +5805,43 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX8-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[14:15], s[0:1], s10
-; GFX8-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[0:1], s10
+; GFX8-NEXT:    s_lshr_b64 s[18:19], s[0:1], s11
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
-; GFX8-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX8-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
+; GFX8-NEXT:    s_bfe_u64 s[18:19], s[14:15], 0x10000
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX8-NEXT:    s_cselect_b64 s[14:15], s[14:15], 0
+; GFX8-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
-; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX8-NEXT:    s_mov_b32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX8-NEXT:    s_lshl_b32 s13, s6, 31
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX8-NEXT:    s_sub_i32 s12, s8, 64
-; GFX8-NEXT:    s_sub_i32 s10, 64, s8
+; GFX8-NEXT:    s_sub_i32 s7, s8, 64
+; GFX8-NEXT:    s_sub_i32 s12, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
+; GFX8-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s8
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
-; GFX8-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX8-NEXT:    s_lshl_b64 s[12:13], s[4:5], s12
+; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
-; GFX8-NEXT:    s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
+; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; GFX8-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -5850,42 +5852,43 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX9-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[14:15], s[0:1], s10
-; GFX9-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[0:1], s10
+; GFX9-NEXT:    s_lshr_b64 s[18:19], s[0:1], s11
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
-; GFX9-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX9-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
+; GFX9-NEXT:    s_bfe_u64 s[18:19], s[14:15], 0x10000
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX9-NEXT:    s_cselect_b64 s[14:15], s[14:15], 0
+; GFX9-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
-; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX9-NEXT:    s_mov_b32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX9-NEXT:    s_lshl_b32 s13, s6, 31
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX9-NEXT:    s_sub_i32 s12, s8, 64
-; GFX9-NEXT:    s_sub_i32 s10, 64, s8
+; GFX9-NEXT:    s_sub_i32 s7, s8, 64
+; GFX9-NEXT:    s_sub_i32 s12, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s8
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
-; GFX9-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX9-NEXT:    s_lshl_b64 s[12:13], s[4:5], s12
+; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[6:7], 0
-; GFX9-NEXT:    s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
+; GFX9-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -5896,42 +5899,43 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX10-NEXT:    s_mov_b32 s12, 0
-; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_mov_b32 s14, 0
+; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[14:15], s[0:1], s11
-; GFX10-NEXT:    s_lshl_b64 s[16:17], s[2:3], s10
+; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX10-NEXT:    s_lshl_b64 s[18:19], s[2:3], s10
 ; GFX10-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
-; GFX10-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_bfe_u64 s[18:19], s[12:13], 0x10000
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX10-NEXT:    s_and_b64 s[10:11], s[18:19], s[10:11]
+; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
-; GFX10-NEXT:    s_lshl_b32 s13, s6, 31
+; GFX10-NEXT:    s_lshl_b32 s15, s6, 31
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX10-NEXT:    s_sub_i32 s14, s8, 64
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[14:15]
+; GFX10-NEXT:    s_sub_i32 s7, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
-; GFX10-NEXT:    s_lshl_b64 s[12:13], s[4:5], s9
+; GFX10-NEXT:    s_lshr_b64 s[12:13], s[0:1], s8
+; GFX10-NEXT:    s_lshl_b64 s[14:15], s[4:5], s9
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
-; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[12:13]
-; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
 ; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
+; GFX10-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -5942,42 +5946,44 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX11-NEXT:    s_mov_b32 s12, 0
-; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX11-NEXT:    s_mov_b32 s14, 0
+; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[14:15], s[0:1], s11
-; GFX11-NEXT:    s_lshl_b64 s[16:17], s[2:3], s10
+; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[16:17], s[0:1], s11
+; GFX11-NEXT:    s_lshl_b64 s[18:19], s[2:3], s10
 ; GFX11-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
-; GFX11-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX11-NEXT:    s_bfe_u64 s[18:19], s[12:13], 0x10000
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
+; GFX11-NEXT:    s_and_b64 s[10:11], s[18:19], s[10:11]
+; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX11-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[14:15], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
-; GFX11-NEXT:    s_lshl_b32 s13, s6, 31
+; GFX11-NEXT:    s_lshl_b32 s15, s6, 31
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[6:7], 1
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX11-NEXT:    s_sub_i32 s14, s8, 64
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[14:15]
+; GFX11-NEXT:    s_sub_i32 s7, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s8
-; GFX11-NEXT:    s_lshl_b64 s[12:13], s[4:5], s9
+; GFX11-NEXT:    s_lshr_b64 s[12:13], s[0:1], s8
+; GFX11-NEXT:    s_lshl_b64 s[14:15], s[4:5], s9
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[4:5], s8
-; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[12:13]
-; GFX11-NEXT:    s_lshr_b64 s[4:5], s[4:5], s14
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
+; GFX11-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX11-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX11-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
 ; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[8:9], 0
+; GFX11-NEXT:    s_bfe_u64 s[4:5], s[6:7], 0x10000
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6000,37 +6006,35 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GFX6-NEXT:    v_cndmask_b32_e32 v12, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v13, v1, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], 1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 31, v6
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[6:7], 1
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v15
-; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, 64, v15
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], v15
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], v6
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], v15
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v14
-; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 31, v6
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[6:7], 1
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v15
+; GFX6-NEXT:    v_subrev_i32_e32 v13, vcc, 64, v15
+; GFX6-NEXT:    v_lshr_b64 v[6:7], v[0:1], v15
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], v8
+; GFX6-NEXT:    v_and_b32_e32 v12, v10, v12
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[4:5], v15
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v13
+; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v10, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v11, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, v12, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX6-NEXT:    v_or_b32_e32 v7, v7, v9
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v10
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX6-NEXT:    v_or_b32_e32 v0, v12, v0
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i128:
@@ -6048,37 +6052,35 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v1, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[4:5]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 31, v6
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[6:7]
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v15
-; GFX8-NEXT:    v_subrev_u32_e32 v14, vcc, 64, v15
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v15, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v6, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v15, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v14, v[2:3]
-; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 31, v6
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[6:7]
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v15
+; GFX8-NEXT:    v_subrev_u32_e32 v13, vcc, 64, v15
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v15, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v8, v[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v12, v10, v12
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v15, v[4:5]
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v13, v[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v10, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v11, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v12, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX8-NEXT:    v_or_b32_e32 v7, v7, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i128:
@@ -6096,22 +6098,21 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v13, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v1, v9, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v0, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, v8, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v8, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[6:7]
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 31, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v15
-; GFX9-NEXT:    v_subrev_u32_e32 v14, 64, v15
+; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v15
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v15, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v15, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v14, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v16, v[2:3]
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
@@ -6120,12 +6121,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v10, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v11, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v12, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_or_b32 v0, v10, v12, v0
+; GFX9-NEXT:    v_and_or_b32 v1, 0, v13, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v8, v11
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v9, v14
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i128:
@@ -6151,79 +6151,79 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[12:13]
 ; GFX10-NEXT:    v_or_b32_e32 v11, v11, v9
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v8, v[12:13]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v14, v14, v16
 ; GFX10-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[12:13]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v7, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v6, v4
-; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v8
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v18
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v8, v14, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v9, v15, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v19, v[12:13]
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, v2, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v11, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s5
+; GFX10-NEXT:    v_and_or_b32 v0, v20, v6, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0, v7, v4
+; GFX10-NEXT:    v_and_or_b32 v2, v5, v8, v2
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v9, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0x7f, v8
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX11-NEXT:    v_not_b32_e32 v8, v8
 ; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 1, v[6:7]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
 ; GFX11-NEXT:    v_lshl_or_b32 v5, v6, 31, v5
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v18, v[0:1]
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
-; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v8
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
 ; GFX11-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v8
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v16, v[12:13]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v9
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v19
 ; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v8, v[12:13]
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v14, v14, v16
 ; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v19, v[12:13]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v14, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v18
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, v0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v6, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v0, v2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, v7, v5
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v9
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v8, v14, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v11, v9, v15, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v19, v[12:13]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v11, v5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v9, v3
+; GFX11-NEXT:    v_and_or_b32 v0, v20, v6, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_and_or_b32 v1, 0, v7, v4
+; GFX11-NEXT:    v_and_or_b32 v2, v5, v8, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
   ret i128 %result
@@ -6234,19 +6234,18 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX6-NEXT:    v_not_b32_e32 v0, v0
-; GFX6-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v6
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v0
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], s[2:3], v6
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v6
-; GFX6-NEXT:    v_lshl_b64 v[4:5], s[0:1], v6
+; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 64, v6
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshl_b64 v[0:1], s[0:1], v8
+; GFX6-NEXT:    v_lshl_b64 v[0:1], s[0:1], v7
+; GFX6-NEXT:    v_lshl_b64 v[4:5], s[0:1], v6
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX6-NEXT:    s_mov_b32 s8, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s2
@@ -6254,51 +6253,49 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX6-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX6-NEXT:    v_and_b32_e32 v9, v5, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v7
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v7
-; GFX6-NEXT:    v_lshl_b64 v[2:3], s[2:3], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v7
-; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[2:3], v11
-; GFX6-NEXT:    v_lshr_b64 v[4:5], s[2:3], v7
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v8
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v8
+; GFX6-NEXT:    v_lshl_b64 v[4:5], s[2:3], v4
+; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, 64, v8
+; GFX6-NEXT:    v_or_b32_e32 v4, v0, v4
+; GFX6-NEXT:    v_or_b32_e32 v5, v1, v5
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[2:3], v10
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX6-NEXT:    v_lshr_b64 v[6:7], s[2:3], v8
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v4, s0
+; GFX6-NEXT:    v_mov_b32_e32 v5, s1
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX6-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshl_i128_ssv:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v6
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v6, s[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v6
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
+; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 64, v6
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, s[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v7, s[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX8-NEXT:    s_mov_b32 s8, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
@@ -6306,51 +6303,49 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX8-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX8-NEXT:    v_and_b32_e32 v9, v5, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v7
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v7, s[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 64, v7
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v7, s[2:3]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v8, s[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v4, s[2:3]
+; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, 64, v8
+; GFX8-NEXT:    v_or_b32_e32 v4, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v1, v5
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v10, s[2:3]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v8, s[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v0, v9, v0
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshl_i128_ssv:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v6
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v6, s[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v6
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
+; GFX9-NEXT:    v_subrev_u32_e32 v7, 64, v6
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, s[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v7, s[0:1]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
 ; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -6358,137 +6353,130 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 1
 ; GFX9-NEXT:    s_lshl_b32 s9, s6, 31
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v0, v2, vcc
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX9-NEXT:    s_lshr_b64 s[2:3], s[6:7], 1
-; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v7, s[0:1]
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v1, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v8, s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v11, 64, v7
+; GFX9-NEXT:    v_subrev_u32_e32 v12, 64, v8
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v7, s[2:3]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v12, s[2:3]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v8, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_or_b32 v0, v9, v4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, 0, v5, v1
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v6, v10
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v7, v11
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshl_i128_ssv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_and_b32_e32 v12, 0x7f, v0
+; GFX10-NEXT:    v_and_b32_e32 v10, 0x7f, v0
 ; GFX10-NEXT:    v_not_b32_e32 v0, v0
 ; GFX10-NEXT:    s_mov_b32 s8, 0
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
 ; GFX10-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT:    v_and_b32_e32 v13, 0x7f, v0
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
-; GFX10-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v10
+; GFX10-NEXT:    v_and_b32_e32 v11, 0x7f, v0
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v10, s[2:3]
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
-; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v13, s[8:9]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v12, s[0:1]
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, 64, v13
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v11
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v12, 64, v10
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v11, s[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v10, s[0:1]
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v10
+; GFX10-NEXT:    v_or_b32_e32 v13, v2, v0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 64, v11
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v1
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v0, s[6:7]
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
+; GFX10-NEXT:    v_or_b32_e32 v14, v3, v1
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[0:1]
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v11
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX10-NEXT:    v_or_b32_e32 v7, v7, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s8, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, s2, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, s3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s9, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX10-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, v5, v1
-; GFX10-NEXT:    v_or_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v2, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v3, v7, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v10
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v11, s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v0, s2, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, s4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v1, s3, s1
+; GFX10-NEXT:    v_and_or_b32 v0, v12, v4, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0, v5, v6
+; GFX10-NEXT:    v_and_or_b32 v2, v7, v2, v8
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v3, v9
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_fshl_i128_ssv:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e32 v12, 0x7f, v0
+; GFX11-NEXT:    v_and_b32_e32 v10, 0x7f, v0
 ; GFX11-NEXT:    v_not_b32_e32 v0, v0
 ; GFX11-NEXT:    s_mov_b32 s8, 0
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
 ; GFX11-NEXT:    s_lshl_b32 s9, s6, 31
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, s[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX11-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 0, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
-; GFX11-NEXT:    v_and_b32_e32 v13, 0x7f, v0
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v10
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0x7f, v0
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, s[2:3]
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v12, 64, v10
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v13, s[8:9]
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v1
+; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v13, v2, v0
+; GFX11-NEXT:    v_or_b32_e32 v14, v3, v1
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc_lo
+; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 64, v11
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 64, v11
+; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v11, s[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v10, s[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[6:7]
+; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v11
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v11
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v10
 ; GFX11-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v0, 64, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v7, v7, v9
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v0, s[6:7]
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v8, s2, s4
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v10, s3, s4
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s8, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s9, s1
-; GFX11-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v0, s2, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, v1, s3, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v2, v6, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v3, v7, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v11, s[6:7]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, s4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v7, s5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v3, v9
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX11-NEXT:    v_and_or_b32 v0, v12, v4, v0
+; GFX11-NEXT:    v_and_or_b32 v1, 0, v5, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX11-NEXT:    v_and_or_b32 v2, v7, v2, v8
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
   %cast.result = bitcast i128 %result to <4 x float>
@@ -6503,49 +6491,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX6-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[0:1], s6
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], s6
+; GFX6-NEXT:    s_lshr_b64 s[12:13], s[0:1], s7
 ; GFX6-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
-; GFX6-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX6-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX6-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 31, v2
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    s_sub_i32 s0, s4, 64
-; GFX6-NEXT:    s_sub_i32 s1, 64, s4
+; GFX6-NEXT:    s_sub_i32 s3, s4, 64
+; GFX6-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s4
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s1
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s5
 ; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s4
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s5
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s3
+; GFX6-NEXT:    s_and_b32 s3, 1, s2
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s6
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX6-NEXT:    s_and_b32 s3, 1, s6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, s9, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX6-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX6-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX6-NEXT:    v_or_b32_e32 v1, s11, v1
+; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshl_i128_svs:
@@ -6555,49 +6545,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX8-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[0:1], s6
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], s6
+; GFX8-NEXT:    s_lshr_b64 s[12:13], s[0:1], s7
 ; GFX8-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
-; GFX8-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX8-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX8-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 31, v2
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    s_sub_i32 s0, s4, 64
-; GFX8-NEXT:    s_sub_i32 s1, 64, s4
+; GFX8-NEXT:    s_sub_i32 s3, s4, 64
+; GFX8-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s5, v[2:3]
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX8-NEXT:    s_and_b32 s0, 1, s5
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s3, v[2:3]
+; GFX8-NEXT:    s_and_b32 s3, 1, s2
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_and_b32 s0, 1, s6
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX8-NEXT:    s_and_b32 s3, 1, s6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, s8, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, s9, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX8-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX8-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX8-NEXT:    v_or_b32_e32 v1, s11, v1
+; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshl_i128_svs:
@@ -6607,48 +6599,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX9-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[0:1], s6
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], s6
+; GFX9-NEXT:    s_lshr_b64 s[12:13], s[0:1], s7
 ; GFX9-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
-; GFX9-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX9-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX9-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[6:7], s[0:1]
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    s_sub_i32 s0, s4, 64
-; GFX9-NEXT:    s_sub_i32 s1, 64, s4
+; GFX9-NEXT:    s_sub_i32 s3, s4, 64
+; GFX9-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s5, v[2:3]
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s4, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX9-NEXT:    s_and_b32 s0, 1, s5
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s3, v[2:3]
+; GFX9-NEXT:    s_and_b32 s3, 1, s2
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_and_b32 s0, 1, s6
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX9-NEXT:    s_and_b32 s3, 1, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, s8, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, s9, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX9-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX9-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX9-NEXT:    v_or_b32_e32 v1, s11, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshl_i128_svs:
@@ -6659,45 +6653,45 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s7
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
+; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
 ; GFX10-NEXT:    s_lshl_b64 s[6:7], s[0:1], s6
-; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX10-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_and_b64 s[6:7], s[12:13], s[6:7]
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s2, 64, s4
+; GFX10-NEXT:    s_sub_i32 s3, s4, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s0, 64, s4
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX10-NEXT:    s_sub_i32 s0, s4, 64
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s2, v[2:3]
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s3, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, 1, s1
+; GFX10-NEXT:    s_and_b32 s3, 1, s2
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s0, 1, s5
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT:    s_and_b32 s3, 1, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
-; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10-NEXT:    v_and_or_b32 v2, s2, v2, s0
+; GFX10-NEXT:    v_and_or_b32 v3, s3, v3, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v0, s6, v0
 ; GFX10-NEXT:    v_or_b32_e32 v1, s7, v1
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -6710,49 +6704,46 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX11-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
 ; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[8:9], s[0:1], s7
-; GFX11-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
+; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s7
+; GFX11-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
 ; GFX11-NEXT:    s_lshl_b64 s[6:7], s[0:1], s6
-; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX11-NEXT:    s_bfe_u64 s[12:13], s[8:9], 0x10000
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s5
-; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX11-NEXT:    s_and_b64 s[6:7], s[12:13], s[6:7]
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 31, v1
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX11-NEXT:    s_sub_i32 s2, 64, s4
+; GFX11-NEXT:    s_sub_i32 s3, s4, 64
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s0, 64, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX11-NEXT:    s_sub_i32 s0, s4, 64
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s2, v[2:3]
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], s3, v[2:3]
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX11-NEXT:    s_and_b32 s0, 1, s1
+; GFX11-NEXT:    s_and_b32 s3, 1, s2
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s0, 1, s5
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s4, v[2:3]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX11-NEXT:    s_and_b32 s3, 1, s5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX11-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX11-NEXT:    v_and_or_b32 v2, s2, v2, s0
+; GFX11-NEXT:    v_and_or_b32 v3, s3, v3, s1
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v0, s6, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v1, s7, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6768,47 +6759,48 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX6-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX6-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX6-NEXT:    s_mov_b32 s8, 0
 ; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s7
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s6
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], s6
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], s5
+; GFX6-NEXT:    s_and_b32 s5, 1, s10
+; GFX6-NEXT:    s_mov_b32 s8, 0
+; GFX6-NEXT:    s_bfe_u64 s[6:7], s[10:11], 0x10000
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX6-NEXT:    s_and_b32 s5, 1, s9
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX6-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s6
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
-; GFX6-NEXT:    s_and_b32 s5, 1, s10
+; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT:    v_and_b32_e32 v7, s7, v9
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT:    s_sub_i32 s10, s4, 64
-; GFX6-NEXT:    s_sub_i32 s8, 64, s4
+; GFX6-NEXT:    s_sub_i32 s7, s4, 64
+; GFX6-NEXT:    s_sub_i32 s10, 64, s4
 ; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    v_and_b32_e32 v6, s6, v8
+; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[2:3], s4
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
 ; GFX6-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX6-NEXT:    v_or_b32_e32 v1, s1, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
@@ -6822,47 +6814,48 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX8-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX8-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX8-NEXT:    s_mov_b32 s8, 0
 ; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX8-NEXT:    s_and_b32 s5, 1, s10
+; GFX8-NEXT:    s_mov_b32 s8, 0
+; GFX8-NEXT:    s_bfe_u64 s[6:7], s[10:11], 0x10000
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX8-NEXT:    s_and_b32 s5, 1, s9
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX8-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
-; GFX8-NEXT:    s_and_b32 s5, 1, s10
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT:    v_and_b32_e32 v7, s7, v9
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT:    s_sub_i32 s10, s4, 64
-; GFX8-NEXT:    s_sub_i32 s8, 64, s4
+; GFX8-NEXT:    s_sub_i32 s7, s4, 64
+; GFX8-NEXT:    s_sub_i32 s10, 64, s4
 ; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    v_and_b32_e32 v6, s6, v8
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[2:3], s4
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
 ; GFX8-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, s1, v7
 ; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
@@ -6876,47 +6869,48 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX9-NEXT:    s_sub_i32 s5, s6, 64
 ; GFX9-NEXT:    s_sub_i32 s7, 64, s6
 ; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX9-NEXT:    s_mov_b32 s8, 0
 ; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX9-NEXT:    s_and_b32 s5, 1, s10
+; GFX9-NEXT:    s_mov_b32 s8, 0
+; GFX9-NEXT:    s_bfe_u64 s[6:7], s[10:11], 0x10000
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX9-NEXT:    s_and_b32 s5, 1, s9
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX9-NEXT:    s_lshl_b32 s9, s2, 31
-; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
-; GFX9-NEXT:    s_and_b32 s5, 1, s10
+; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT:    v_and_b32_e32 v7, s7, v9
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT:    s_sub_i32 s10, s4, 64
-; GFX9-NEXT:    s_sub_i32 s8, 64, s4
+; GFX9-NEXT:    s_sub_i32 s7, s4, 64
+; GFX9-NEXT:    s_sub_i32 s10, 64, s4
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    v_and_b32_e32 v6, s6, v8
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
 ; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[6:7], s[2:3], s4
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[2:3], s4
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_or_b32_e32 v0, s0, v6
 ; GFX9-NEXT:    v_or_b32_e32 v1, s1, v7
 ; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
@@ -6927,52 +6921,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX10-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX10-NEXT:    s_sub_i32 s5, s6, 64
-; GFX10-NEXT:    s_sub_i32 s7, 64, s6
+; GFX10-NEXT:    s_sub_i32 s5, 64, s6
+; GFX10-NEXT:    s_sub_i32 s7, s6, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s5, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
-; GFX10-NEXT:    s_and_b32 s6, 1, s8
+; GFX10-NEXT:    s_mov_b32 s10, 0
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_and_b32 s9, 1, s8
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
-; GFX10-NEXT:    s_mov_b32 s6, 0
-; GFX10-NEXT:    s_lshl_b32 s7, s2, 31
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX10-NEXT:    s_and_b32 s5, 1, s9
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT:    s_lshl_b32 s11, s2, 31
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX10-NEXT:    s_bfe_u64 s[6:7], s[8:9], 0x10000
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s9
+; GFX10-NEXT:    s_and_b32 s5, 1, s5
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
 ; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT:    s_sub_i32 s10, s4, 64
-; GFX10-NEXT:    s_sub_i32 s8, 64, s4
+; GFX10-NEXT:    s_sub_i32 s9, s4, 64
+; GFX10-NEXT:    s_sub_i32 s12, 64, s4
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
-; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s4
+; GFX10-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
-; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s9
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[10:11], s[2:3]
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX10-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX10-NEXT:    s_bfe_u64 s[2:3], s[8:9], 0x10000
+; GFX10-NEXT:    v_and_or_b32 v0, s6, v8, s0
+; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    v_and_or_b32 v1, s7, v9, s1
 ; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -6981,49 +6974,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX11-NEXT:    s_and_not1_b64 s[4:5], 0x7f, s[4:5]
-; GFX11-NEXT:    s_sub_i32 s5, s6, 64
-; GFX11-NEXT:    s_sub_i32 s7, 64, s6
+; GFX11-NEXT:    s_sub_i32 s5, 64, s6
+; GFX11-NEXT:    s_sub_i32 s7, s6, 64
 ; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s5, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
-; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX11-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
-; GFX11-NEXT:    s_and_b32 s6, 1, s8
+; GFX11-NEXT:    s_mov_b32 s10, 0
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    s_and_b32 s9, 1, s8
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
-; GFX11-NEXT:    s_mov_b32 s6, 0
-; GFX11-NEXT:    s_lshl_b32 s7, s2, 31
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX11-NEXT:    s_and_b32 s5, 1, s9
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX11-NEXT:    s_lshl_b32 s11, s2, 31
+; GFX11-NEXT:    v_lshlrev_b64 v[8:9], s6, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX11-NEXT:    s_bfe_u64 s[6:7], s[8:9], 0x10000
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s9
+; GFX11-NEXT:    s_and_b32 s5, 1, s5
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
 ; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT:    s_sub_i32 s10, s4, 64
-; GFX11-NEXT:    s_sub_i32 s8, 64, s4
+; GFX11-NEXT:    s_sub_i32 s9, s4, 64
+; GFX11-NEXT:    s_sub_i32 s12, 64, s4
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9
-; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
-; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX11-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s4
+; GFX11-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
-; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s9
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX11-NEXT:    s_cselect_b64 s[2:3], s[10:11], s[2:3]
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX11-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX11-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX11-NEXT:    s_bfe_u64 s[2:3], s[8:9], 0x10000
+; GFX11-NEXT:    v_and_or_b32 v0, s6, v8, s0
+; GFX11-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    v_and_or_b32 v1, s7, v9, s1
 ; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -7160,83 +7154,85 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX6-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX6-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX6-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[24:25], s[0:1], s18
-; GFX6-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX6-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[26:27], s[0:1], s18
+; GFX6-NEXT:    s_lshr_b64 s[28:29], s[0:1], s19
 ; GFX6-NEXT:    s_lshl_b64 s[18:19], s[2:3], s18
-; GFX6-NEXT:    s_or_b64 s[18:19], s[26:27], s[18:19]
+; GFX6-NEXT:    s_or_b64 s[18:19], s[28:29], s[18:19]
+; GFX6-NEXT:    s_bfe_u64 s[28:29], s[24:25], 0x10000
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX6-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX6-NEXT:    s_and_b64 s[26:27], s[28:29], s[26:27]
+; GFX6-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX6-NEXT:    s_mov_b32 s22, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
 ; GFX6-NEXT:    s_lshl_b32 s23, s10, 31
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX6-NEXT:    s_sub_i32 s23, s16, 64
-; GFX6-NEXT:    s_sub_i32 s18, 64, s16
+; GFX6-NEXT:    s_sub_i32 s11, s16, 64
+; GFX6-NEXT:    s_sub_i32 s23, 64, s16
 ; GFX6-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX6-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
+; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[18:19], s[8:9], s16
 ; GFX6-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
-; GFX6-NEXT:    s_lshl_b64 s[18:19], s[8:9], s18
-; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX6-NEXT:    s_lshl_b64 s[24:25], s[8:9], s23
+; GFX6-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX6-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
-; GFX6-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX6-NEXT:    s_or_b64 s[0:1], s[26:27], s[0:1]
 ; GFX6-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX6-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
-; GFX6-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX6-NEXT:    s_lshl_b64 s[18:19], s[4:5], s8
+; GFX6-NEXT:    s_lshr_b64 s[20:21], s[4:5], s9
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
-; GFX6-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
+; GFX6-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
+; GFX6-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX6-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX6-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX6-NEXT:    s_lshl_b32 s23, s14, 31
 ; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
 ; GFX6-NEXT:    s_lshr_b64 s[8:9], s[14:15], 1
-; GFX6-NEXT:    s_sub_i32 s18, s10, 64
-; GFX6-NEXT:    s_sub_i32 s14, 64, s10
+; GFX6-NEXT:    s_sub_i32 s13, s10, 64
+; GFX6-NEXT:    s_sub_i32 s16, 64, s10
 ; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[12:13], s[8:9], s10
+; GFX6-NEXT:    s_lshr_b64 s[14:15], s[8:9], s10
 ; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
-; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[8:9], s16
+; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[8:9], s13
+; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
-; GFX6-NEXT:    s_or_b64 s[4:5], s[16:17], s[4:5]
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[12:13], 0x10000
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], s[14:15]
+; GFX6-NEXT:    s_or_b64 s[4:5], s[18:19], s[4:5]
 ; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -7247,83 +7243,85 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX8-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX8-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[24:25], s[0:1], s18
-; GFX8-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[26:27], s[0:1], s18
+; GFX8-NEXT:    s_lshr_b64 s[28:29], s[0:1], s19
 ; GFX8-NEXT:    s_lshl_b64 s[18:19], s[2:3], s18
-; GFX8-NEXT:    s_or_b64 s[18:19], s[26:27], s[18:19]
+; GFX8-NEXT:    s_or_b64 s[18:19], s[28:29], s[18:19]
+; GFX8-NEXT:    s_bfe_u64 s[28:29], s[24:25], 0x10000
 ; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX8-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX8-NEXT:    s_and_b64 s[26:27], s[28:29], s[26:27]
+; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX8-NEXT:    s_mov_b32 s22, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
 ; GFX8-NEXT:    s_lshl_b32 s23, s10, 31
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX8-NEXT:    s_sub_i32 s23, s16, 64
-; GFX8-NEXT:    s_sub_i32 s18, 64, s16
+; GFX8-NEXT:    s_sub_i32 s11, s16, 64
+; GFX8-NEXT:    s_sub_i32 s23, 64, s16
 ; GFX8-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
+; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[18:19], s[8:9], s16
 ; GFX8-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
-; GFX8-NEXT:    s_lshl_b64 s[18:19], s[8:9], s18
-; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX8-NEXT:    s_lshl_b64 s[24:25], s[8:9], s23
+; GFX8-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX8-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX8-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
-; GFX8-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX8-NEXT:    s_or_b64 s[0:1], s[26:27], s[0:1]
 ; GFX8-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX8-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
-; GFX8-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX8-NEXT:    s_lshl_b64 s[18:19], s[4:5], s8
+; GFX8-NEXT:    s_lshr_b64 s[20:21], s[4:5], s9
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
-; GFX8-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
+; GFX8-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
+; GFX8-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX8-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX8-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX8-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX8-NEXT:    s_lshl_b32 s23, s14, 31
 ; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
 ; GFX8-NEXT:    s_lshr_b64 s[8:9], s[14:15], 1
-; GFX8-NEXT:    s_sub_i32 s18, s10, 64
-; GFX8-NEXT:    s_sub_i32 s14, 64, s10
+; GFX8-NEXT:    s_sub_i32 s13, s10, 64
+; GFX8-NEXT:    s_sub_i32 s16, 64, s10
 ; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[12:13], s[8:9], s10
+; GFX8-NEXT:    s_lshr_b64 s[14:15], s[8:9], s10
 ; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
-; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[8:9], s16
+; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[8:9], s13
+; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
-; GFX8-NEXT:    s_or_b64 s[4:5], s[16:17], s[4:5]
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[12:13], 0x10000
+; GFX8-NEXT:    s_and_b64 s[8:9], s[8:9], s[14:15]
+; GFX8-NEXT:    s_or_b64 s[4:5], s[18:19], s[4:5]
 ; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -7334,83 +7332,85 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX9-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX9-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[24:25], s[0:1], s18
-; GFX9-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[26:27], s[0:1], s18
+; GFX9-NEXT:    s_lshr_b64 s[28:29], s[0:1], s19
 ; GFX9-NEXT:    s_lshl_b64 s[18:19], s[2:3], s18
-; GFX9-NEXT:    s_or_b64 s[18:19], s[26:27], s[18:19]
+; GFX9-NEXT:    s_or_b64 s[18:19], s[28:29], s[18:19]
+; GFX9-NEXT:    s_bfe_u64 s[28:29], s[24:25], 0x10000
 ; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
-; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX9-NEXT:    s_cselect_b64 s[24:25], s[24:25], 0
+; GFX9-NEXT:    s_and_b64 s[26:27], s[28:29], s[26:27]
+; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[18:19], s[0:1]
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
 ; GFX9-NEXT:    s_mov_b32 s22, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
 ; GFX9-NEXT:    s_lshl_b32 s23, s10, 31
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX9-NEXT:    s_sub_i32 s23, s16, 64
-; GFX9-NEXT:    s_sub_i32 s18, 64, s16
+; GFX9-NEXT:    s_sub_i32 s11, s16, 64
+; GFX9-NEXT:    s_sub_i32 s23, 64, s16
 ; GFX9-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[8:9], s16
+; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[18:19], s[8:9], s16
 ; GFX9-NEXT:    s_lshr_b64 s[16:17], s[0:1], s16
-; GFX9-NEXT:    s_lshl_b64 s[18:19], s[8:9], s18
-; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX9-NEXT:    s_lshl_b64 s[24:25], s[8:9], s23
+; GFX9-NEXT:    s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[16:17], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[10:11], 0
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX9-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
-; GFX9-NEXT:    s_or_b64 s[0:1], s[24:25], s[0:1]
+; GFX9-NEXT:    s_or_b64 s[0:1], s[26:27], s[0:1]
 ; GFX9-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX9-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], s8
-; GFX9-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX9-NEXT:    s_lshl_b64 s[18:19], s[4:5], s8
+; GFX9-NEXT:    s_lshr_b64 s[20:21], s[4:5], s9
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
-; GFX9-NEXT:    s_or_b64 s[8:9], s[18:19], s[8:9]
+; GFX9-NEXT:    s_or_b64 s[8:9], s[20:21], s[8:9]
+; GFX9-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
+; GFX9-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
-; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX9-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
 ; GFX9-NEXT:    s_lshl_b32 s23, s14, 31
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
 ; GFX9-NEXT:    s_lshr_b64 s[8:9], s[14:15], 1
-; GFX9-NEXT:    s_sub_i32 s18, s10, 64
-; GFX9-NEXT:    s_sub_i32 s14, 64, s10
+; GFX9-NEXT:    s_sub_i32 s13, s10, 64
+; GFX9-NEXT:    s_sub_i32 s16, 64, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[12:13], s[8:9], s10
+; GFX9-NEXT:    s_lshr_b64 s[14:15], s[8:9], s10
 ; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT:    s_lshl_b64 s[14:15], s[8:9], s14
-; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s18
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[8:9], s16
+; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[8:9], s13
+; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
 ; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[12:13], 0
-; GFX9-NEXT:    s_or_b64 s[4:5], s[16:17], s[4:5]
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[12:13], 0x10000
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], s[14:15]
+; GFX9-NEXT:    s_or_b64 s[4:5], s[18:19], s[4:5]
 ; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -7421,83 +7421,85 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX10-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX10-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX10-NEXT:    s_mov_b32 s22, 0
-; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_mov_b32 s24, 0
+; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s19
-; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], s18
+; GFX10-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX10-NEXT:    s_lshl_b64 s[28:29], s[2:3], s18
 ; GFX10-NEXT:    s_lshl_b64 s[18:19], s[0:1], s18
-; GFX10-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX10-NEXT:    s_or_b64 s[26:27], s[26:27], s[28:29]
+; GFX10-NEXT:    s_bfe_u64 s[28:29], s[22:23], 0x10000
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
+; GFX10-NEXT:    s_and_b64 s[18:19], s[28:29], s[18:19]
+; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[26:27], s[0:1]
 ; GFX10-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX10-NEXT:    s_cselect_b64 s[18:19], s[18:19], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
-; GFX10-NEXT:    s_lshl_b32 s23, s10, 31
+; GFX10-NEXT:    s_lshl_b32 s25, s10, 31
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX10-NEXT:    s_sub_i32 s23, s16, 64
+; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
+; GFX10-NEXT:    s_sub_i32 s11, s16, 64
 ; GFX10-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX10-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
-; GFX10-NEXT:    s_lshl_b64 s[24:25], s[8:9], s17
+; GFX10-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[22:23], s[0:1], s16
+; GFX10-NEXT:    s_lshl_b64 s[26:27], s[8:9], s17
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
-; GFX10-NEXT:    s_or_b64 s[10:11], s[10:11], s[24:25]
-; GFX10-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_or_b64 s[22:23], s[22:23], s[26:27]
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b64 s[8:9], s[22:23], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s25, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
+; GFX10-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
 ; GFX10-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX10-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
 ; GFX10-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[16:17], s[4:5], s9
-; GFX10-NEXT:    s_lshl_b64 s[18:19], s[6:7], s8
+; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX10-NEXT:    s_lshl_b64 s[20:21], s[6:7], s8
 ; GFX10-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
+; GFX10-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[18:19], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
-; GFX10-NEXT:    s_lshl_b32 s23, s14, 31
+; GFX10-NEXT:    s_lshl_b32 s25, s14, 31
 ; GFX10-NEXT:    s_lshr_b64 s[12:13], s[14:15], 1
-; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
-; GFX10-NEXT:    s_sub_i32 s18, s10, 64
+; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[24:25]
+; GFX10-NEXT:    s_sub_i32 s15, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[14:15], s[4:5], s10
-; GFX10-NEXT:    s_lshl_b64 s[16:17], s[12:13], s11
+; GFX10-NEXT:    s_lshr_b64 s[16:17], s[4:5], s10
+; GFX10-NEXT:    s_lshl_b64 s[18:19], s[12:13], s11
 ; GFX10-NEXT:    s_lshr_b64 s[10:11], s[12:13], s10
-; GFX10-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GFX10-NEXT:    s_lshr_b64 s[12:13], s[12:13], s18
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX10-NEXT:    s_cselect_b64 s[12:13], s[14:15], s[12:13]
+; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_lshr_b64 s[12:13], s[12:13], s15
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b64 s[12:13], s[16:17], s[12:13]
 ; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[12:13]
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX10-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX10-NEXT:    s_bfe_u64 s[12:13], s[14:15], 0x10000
 ; GFX10-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX10-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -7508,83 +7510,87 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX11-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX11-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX11-NEXT:    s_mov_b32 s22, 0
-; GFX11-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX11-NEXT:    s_mov_b32 s24, 0
+; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[24:25], s[0:1], s19
-; GFX11-NEXT:    s_lshl_b64 s[26:27], s[2:3], s18
+; GFX11-NEXT:    s_cselect_b32 s23, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[26:27], s[0:1], s19
+; GFX11-NEXT:    s_lshl_b64 s[28:29], s[2:3], s18
 ; GFX11-NEXT:    s_lshl_b64 s[18:19], s[0:1], s18
-; GFX11-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX11-NEXT:    s_or_b64 s[26:27], s[26:27], s[28:29]
+; GFX11-NEXT:    s_bfe_u64 s[28:29], s[22:23], 0x10000
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s17
+; GFX11-NEXT:    s_and_b64 s[18:19], s[28:29], s[18:19]
+; GFX11-NEXT:    s_cmp_lg_u32 s22, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[26:27], s[0:1]
 ; GFX11-NEXT:    s_cmp_lg_u32 s23, 0
-; GFX11-NEXT:    s_cselect_b64 s[18:19], s[18:19], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[8:9], 1
-; GFX11-NEXT:    s_lshl_b32 s23, s10, 31
+; GFX11-NEXT:    s_lshl_b32 s25, s10, 31
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[10:11], 1
-; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX11-NEXT:    s_sub_i32 s23, s16, 64
+; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
+; GFX11-NEXT:    s_sub_i32 s11, s16, 64
 ; GFX11-NEXT:    s_sub_i32 s17, 64, s16
 ; GFX11-NEXT:    s_cmp_lt_u32 s16, 64
-; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX11-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[10:11], s[0:1], s16
-; GFX11-NEXT:    s_lshl_b64 s[24:25], s[8:9], s17
+; GFX11-NEXT:    s_cselect_b32 s25, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[22:23], s[0:1], s16
+; GFX11-NEXT:    s_lshl_b64 s[26:27], s[8:9], s17
 ; GFX11-NEXT:    s_lshr_b64 s[16:17], s[8:9], s16
-; GFX11-NEXT:    s_or_b64 s[10:11], s[10:11], s[24:25]
-; GFX11-NEXT:    s_lshr_b64 s[8:9], s[8:9], s23
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[10:11], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX11-NEXT:    s_or_b64 s[22:23], s[22:23], s[26:27]
+; GFX11-NEXT:    s_lshr_b64 s[8:9], s[8:9], s11
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX11-NEXT:    s_cselect_b64 s[8:9], s[22:23], s[8:9]
+; GFX11-NEXT:    s_cmp_lg_u32 s25, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[8:9]
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[16:17], 0
+; GFX11-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
 ; GFX11-NEXT:    s_and_not1_b64 s[10:11], 0x7f, s[20:21]
+; GFX11-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX11-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX11-NEXT:    s_or_b64 s[0:1], s[18:19], s[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[16:17], s[4:5], s9
-; GFX11-NEXT:    s_lshl_b64 s[18:19], s[6:7], s8
+; GFX11-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX11-NEXT:    s_lshr_b64 s[18:19], s[4:5], s9
+; GFX11-NEXT:    s_lshl_b64 s[20:21], s[6:7], s8
 ; GFX11-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX11-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
+; GFX11-NEXT:    s_bfe_u64 s[20:21], s[16:17], 0x10000
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], s11
-; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], s[8:9]
+; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX11-NEXT:    s_cselect_b64 s[4:5], s[18:19], s[4:5]
+; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
 ; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[12:13], 1
-; GFX11-NEXT:    s_lshl_b32 s23, s14, 31
+; GFX11-NEXT:    s_lshl_b32 s25, s14, 31
 ; GFX11-NEXT:    s_lshr_b64 s[12:13], s[14:15], 1
-; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[22:23]
-; GFX11-NEXT:    s_sub_i32 s18, s10, 64
+; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[24:25]
+; GFX11-NEXT:    s_sub_i32 s15, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[14:15], s[4:5], s10
-; GFX11-NEXT:    s_lshl_b64 s[16:17], s[12:13], s11
+; GFX11-NEXT:    s_lshr_b64 s[16:17], s[4:5], s10
+; GFX11-NEXT:    s_lshl_b64 s[18:19], s[12:13], s11
 ; GFX11-NEXT:    s_lshr_b64 s[10:11], s[12:13], s10
-; GFX11-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GFX11-NEXT:    s_lshr_b64 s[12:13], s[12:13], s18
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX11-NEXT:    s_cselect_b64 s[12:13], s[14:15], s[12:13]
+; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX11-NEXT:    s_lshr_b64 s[12:13], s[12:13], s15
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX11-NEXT:    s_cselect_b64 s[12:13], s[16:17], s[12:13]
 ; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[12:13]
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX11-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
+; GFX11-NEXT:    s_bfe_u64 s[12:13], s[14:15], 0x10000
 ; GFX11-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX11-NEXT:    s_and_b64 s[10:11], s[12:13], s[10:11]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
@@ -7607,39 +7613,38 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_or_b32_e32 v16, v16, v18
 ; GFX6-NEXT:    v_or_b32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, 0, v21, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v19, 0, v22, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v21, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v22, v1, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], 1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 31, v10
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[10:11], 1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v24
-; GFX6-NEXT:    v_subrev_i32_e32 v23, vcc, 64, v24
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[0:1], v24
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], v10
-; GFX6-NEXT:    v_lshr_b64 v[16:17], v[2:3], v24
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v23
-; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 31, v10
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[10:11], 1
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v24
+; GFX6-NEXT:    v_subrev_i32_e32 v22, vcc, 64, v24
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[0:1], v24
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[8:9], v16
+; GFX6-NEXT:    v_and_b32_e32 v21, v18, v21
+; GFX6-NEXT:    v_lshr_b64 v[18:19], v[8:9], v24
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[8:9], v22
+; GFX6-NEXT:    v_or_b32_e32 v10, v10, v16
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX6-NEXT:    v_or_b32_e32 v0, v18, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v8, v8, v18
+; GFX6-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX6-NEXT:    v_and_b32_e32 v18, 0x7f, v20
 ; GFX6-NEXT:    v_not_b32_e32 v8, v20
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX6-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v18
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
 ; GFX6-NEXT:    v_subrev_i32_e32 v20, vcc, 64, v18
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v8
 ; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v18
@@ -7648,39 +7653,36 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX6-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, v4, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], 1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 31, v14
-; GFX6-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX6-NEXT:    v_lshr_b64 v[6:7], v[14:15], 1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 64, v19
-; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, 64, v19
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[4:5], v19
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v10
-; GFX6-NEXT:    v_lshr_b64 v[12:13], v[6:7], v19
-; GFX6-NEXT:    v_lshr_b64 v[6:7], v[6:7], v14
-; GFX6-NEXT:    v_or_b32_e32 v8, v8, v10
-; GFX6-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 31, v14
+; GFX6-NEXT:    v_or_b32_e32 v5, v5, v8
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[14:15], 1
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 64, v19
+; GFX6-NEXT:    v_and_b32_e32 v16, v10, v16
+; GFX6-NEXT:    v_subrev_i32_e32 v17, vcc, 64, v19
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[4:5], v19
+; GFX6-NEXT:    v_lshl_b64 v[12:13], v[8:9], v12
+; GFX6-NEXT:    v_lshr_b64 v[14:15], v[8:9], v19
+; GFX6-NEXT:    v_lshr_b64 v[8:9], v[8:9], v17
+; GFX6-NEXT:    v_or_b32_e32 v10, v10, v12
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v12, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX6-NEXT:    v_or_b32_e32 v2, v21, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v22, v3
+; GFX6-NEXT:    v_or_b32_e32 v11, v11, v13
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GFX6-NEXT:    v_and_b32_e32 v8, v8, v14
+; GFX6-NEXT:    v_or_b32_e32 v0, v21, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
 ; GFX6-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX6-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX6-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX6-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i128:
@@ -7698,39 +7700,38 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_or_b32_e32 v16, v16, v18
 ; GFX8-NEXT:    v_or_b32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, 0, v21, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, 0, v22, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, v1, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 31, v10
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[10:11]
-; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v24
-; GFX8-NEXT:    v_subrev_u32_e32 v23, vcc, 64, v24
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v24, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[16:17], v24, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v23, v[2:3]
-; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 31, v10
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], 1, v[10:11]
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v24
+; GFX8-NEXT:    v_subrev_u32_e32 v22, vcc, 64, v24
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v24, v[0:1]
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[8:9]
+; GFX8-NEXT:    v_and_b32_e32 v21, v18, v21
+; GFX8-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v22, v[8:9]
+; GFX8-NEXT:    v_or_b32_e32 v10, v10, v16
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX8-NEXT:    v_or_b32_e32 v0, v18, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v8, v8, v18
+; GFX8-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX8-NEXT:    v_and_b32_e32 v18, 0x7f, v20
 ; GFX8-NEXT:    v_not_b32_e32 v8, v20
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX8-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; GFX8-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v18
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
 ; GFX8-NEXT:    v_subrev_u32_e32 v20, vcc, 64, v18
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
@@ -7739,39 +7740,36 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v20, v5, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[12:13]
-; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 31, v14
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX8-NEXT:    v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 64, v19
-; GFX8-NEXT:    v_subrev_u32_e32 v14, vcc, 64, v19
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v19, v[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[12:13], v19, v[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v14, v[6:7]
-; GFX8-NEXT:    v_or_b32_e32 v8, v8, v10
-; GFX8-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 31, v14
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], 1, v[14:15]
+; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 64, v19
+; GFX8-NEXT:    v_and_b32_e32 v16, v10, v16
+; GFX8-NEXT:    v_subrev_u32_e32 v17, vcc, 64, v19
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v19, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[12:13], v12, v[8:9]
+; GFX8-NEXT:    v_lshrrev_b64 v[14:15], v19, v[8:9]
+; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v17, v[8:9]
+; GFX8-NEXT:    v_or_b32_e32 v10, v10, v12
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX8-NEXT:    v_or_b32_e32 v2, v21, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v22, v3
+; GFX8-NEXT:    v_or_b32_e32 v11, v11, v13
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GFX8-NEXT:    v_and_b32_e32 v8, v8, v14
+; GFX8-NEXT:    v_or_b32_e32 v0, v21, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[4:5]
 ; GFX8-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX8-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX8-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i128:
@@ -7789,54 +7787,52 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_or_b32_e32 v16, v16, v18
 ; GFX9-NEXT:    v_or_b32_e32 v17, v17, v19
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v18, 0, v21, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v19, 0, v22, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v16, v1, v17, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v0, v2, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 1, v[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v16, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v16, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[10:11]
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v10, 31, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v24
-; GFX9-NEXT:    v_subrev_u32_e32 v23, 64, v24
+; GFX9-NEXT:    v_subrev_u32_e32 v25, 64, v24
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v24, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v24, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v23, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v25, v[2:3]
 ; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX9-NEXT:    v_or_b32_e32 v0, v18, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_or_b32 v0, v18, v21, v0
 ; GFX9-NEXT:    v_and_b32_e32 v18, 0x7f, v20
 ; GFX9-NEXT:    v_not_b32_e32 v8, v20
-; GFX9-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v16, v19
 ; GFX9-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v20, 64, v18
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v18, v[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v17, v23
 ; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v18, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v20, v[4:5]
 ; GFX9-NEXT:    v_or_b32_e32 v8, v8, v10
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v5, v9, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
 ; GFX9-NEXT:    v_cndmask_b32_e32 v18, v4, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e32 v20, v8, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v8, v7, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[6:7], 1, v[14:15]
 ; GFX9-NEXT:    v_lshl_or_b32 v5, v14, 31, v5
 ; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v19
@@ -7853,199 +7849,187 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v19
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX9-NEXT:    v_or_b32_e32 v2, v21, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v22, v3
-; GFX9-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX9-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX9-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX9-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_and_or_b32 v1, 0, v22, v1
+; GFX9-NEXT:    v_and_or_b32 v4, v20, v16, v4
+; GFX9-NEXT:    v_and_or_b32 v5, 0, v17, v5
+; GFX9-NEXT:    v_and_or_b32 v6, v6, v12, v18
+; GFX9-NEXT:    v_and_or_b32 v7, 0, v13, v21
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0x7f, v16
-; GFX10-NEXT:    v_not_b32_e32 v16, v16
+; GFX10-NEXT:    v_not_b32_e32 v21, v16
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v17, 64, v27
-; GFX10-NEXT:    v_and_b32_e32 v28, 0x7f, v16
 ; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v27, v[2:3]
+; GFX10-NEXT:    v_and_b32_e32 v28, 0x7f, v21
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v27
 ; GFX10-NEXT:    v_lshl_or_b32 v9, v10, 31, v9
-; GFX10-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v17, v[0:1]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v25, 64, v28
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v29, 64, v27
-; GFX10-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
-; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v27, v[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v27
-; GFX10-NEXT:    v_or_b32_e32 v18, v16, v18
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 64, v28
-; GFX10-NEXT:    v_lshlrev_b64 v[25:26], v25, v[10:11]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v29, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v21, v[0:1]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v29, 64, v28
+; GFX10-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
+; GFX10-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v28
 ; GFX10-NEXT:    v_or_b32_e32 v19, v17, v19
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v28
-; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v16, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s5, 64, v28
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v27
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v21, v16, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[25:26], v18, v[10:11]
+; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v29, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v19, v22, v19, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v28
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v27, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v23, v23, v25
-; GFX10-NEXT:    v_cndmask_b32_e32 v18, v0, v18, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v0, v24, v26
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v19, v1, v19, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v27
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v28
-; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v0, s4
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX10-NEXT:    v_and_b32_e32 v24, 0x7f, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, v19, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v16, v8, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v17, v9, s5
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, v1, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v21, v3
-; GFX10-NEXT:    v_not_b32_e32 v3, v20
-; GFX10-NEXT:    v_or_b32_e32 v1, v22, v8
+; GFX10-NEXT:    v_or_b32_e32 v24, v24, v26
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, v21, v2, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v22, v19, v3, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v28, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, v17, v24, s5
+; GFX10-NEXT:    v_and_b32_e32 v23, 0x7f, v20
+; GFX10-NEXT:    v_not_b32_e32 v11, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v16, 64, v23
+; GFX10-NEXT:    v_and_b32_e32 v24, 0x7f, v11
+; GFX10-NEXT:    v_and_or_b32 v2, v10, v2, v21
+; GFX10-NEXT:    v_and_or_b32 v0, v18, v0, v8
+; GFX10-NEXT:    v_and_or_b32 v1, 0, v1, v9
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v11, 64, v24
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX10-NEXT:    v_and_b32_e32 v22, 0x7f, v3
-; GFX10-NEXT:    v_lshlrev_b64 v[12:13], v24, v[6:7]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v24
-; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v11, v[4:5]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v16, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[12:13], v23, v[6:7]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v24
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v25, 64, v23
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v23, v[4:5]
 ; GFX10-NEXT:    v_lshl_or_b32 v9, v14, 31, v9
 ; GFX10-NEXT:    v_lshrrev_b64 v[14:15], 1, v[14:15]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v22
-; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v24, v[4:5]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
 ; GFX10-NEXT:    v_or_b32_e32 v12, v10, v12
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v22
-; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v22, v[8:9]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v24
+; GFX10-NEXT:    v_or_b32_e32 v13, v11, v13
+; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v25, v[4:5]
 ; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
-; GFX10-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX10-NEXT:    v_or_b32_e32 v5, v11, v13
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[14:15]
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v22
-; GFX10-NEXT:    v_or_b32_e32 v16, v18, v20
-; GFX10-NEXT:    v_or_b32_e32 v18, v19, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v3, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[3:4], v22, v[14:15]
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v22
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v24
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v18, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s4
-; GFX10-NEXT:    v_or_b32_e32 v3, v23, v25
-; GFX10-NEXT:    v_or_b32_e32 v4, v13, v5
-; GFX10-NEXT:    v_or_b32_e32 v5, v14, v8
-; GFX10-NEXT:    v_or_b32_e32 v6, v6, v9
-; GFX10-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v23
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v24
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v23
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v3, v22
+; GFX10-NEXT:    v_or_b32_e32 v18, v18, v20
+; GFX10-NEXT:    v_or_b32_e32 v19, v19, v21
+; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v10, v18, s4
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v24
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v11, v19, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v24, v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v4, v6, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v12, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v13, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
+; GFX10-NEXT:    v_and_or_b32 v7, 0, v11, v7
+; GFX10-NEXT:    v_and_or_b32 v4, v25, v16, v4
+; GFX10-NEXT:    v_and_or_b32 v5, 0, v17, v8
+; GFX10-NEXT:    v_and_or_b32 v6, v9, v10, v6
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v27, 0x7f, v16
-; GFX11-NEXT:    v_not_b32_e32 v16, v16
+; GFX11-NEXT:    v_not_b32_e32 v21, v16
 ; GFX11-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[21:22], v27, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v17, 64, v27
+; GFX11-NEXT:    v_lshlrev_b64 v[18:19], v27, v[2:3]
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0x7f, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshl_or_b32 v9, v10, 31, v9
 ; GFX11-NEXT:    v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX11-NEXT:    v_dual_cndmask_b32 v21, 0, v21 :: v_dual_cndmask_b32 v22, 0, v22
-; GFX11-NEXT:    v_sub_nc_u32_e32 v17, 64, v27
-; GFX11-NEXT:    v_lshlrev_b64 v[18:19], v27, v[2:3]
-; GFX11-NEXT:    v_sub_nc_u32_e32 v25, 64, v28
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v29, 64, v27
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
 ; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v17, v[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v28
-; GFX11-NEXT:    v_lshlrev_b64 v[25:26], v25, v[10:11]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v29, v[0:1]
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v28
-; GFX11-NEXT:    v_or_b32_e32 v18, v16, v18
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v16, 64, v28
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v27
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX11-NEXT:    v_and_b32_e32 v28, 0x7f, v21
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v21, 64, v27
 ; GFX11-NEXT:    v_or_b32_e32 v19, v17, v19
+; GFX11-NEXT:    v_lshlrev_b64 v[21:22], v21, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v27, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v21, v21, v16, vcc_lo
+; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v28
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v29, 64, v28
+; GFX11-NEXT:    v_lshrrev_b64 v[23:24], v28, v[8:9]
+; GFX11-NEXT:    v_cmp_gt_u32_e64 s1, 64, v28
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, v21, v2, s0
+; GFX11-NEXT:    v_lshlrev_b64 v[25:26], v18, v[10:11]
+; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v29, v[10:11]
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v23, v23, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v0, v18, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v16, v[10:11]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v1, v19, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v0, v24, v26
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v27
-; GFX11-NEXT:    v_and_b32_e32 v24, 0x7f, v20
-; GFX11-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v17, v17, v0, s0
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v23, v19, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v16, v8, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v17, v9, s1
-; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 64, v24
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, v0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v25, 0, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v21, v3
-; GFX11-NEXT:    v_not_b32_e32 v3, v20
-; GFX11-NEXT:    v_or_b32_e32 v1, v22, v8
+; GFX11-NEXT:    v_or_b32_e32 v24, v24, v26
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v16, v16, v23, s1
+; GFX11-NEXT:    v_cndmask_b32_e32 v19, v22, v19, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v17, v17, v24, s1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v28
+; GFX11-NEXT:    v_and_b32_e32 v23, 0x7f, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v22, v19, v3, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v28, v[10:11]
+; GFX11-NEXT:    v_not_b32_e32 v11, v20
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc_lo
+; GFX11-NEXT:    v_and_or_b32 v0, v18, v0, v8
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
+; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v23
+; GFX11-NEXT:    v_and_b32_e32 v24, 0x7f, v11
+; GFX11-NEXT:    v_and_or_b32 v1, 0, v1, v9
 ; GFX11-NEXT:    v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v11, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[12:13], v24, v[6:7]
-; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v24, v[4:5]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
-; GFX11-NEXT:    v_and_b32_e32 v22, 0x7f, v3
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 64, v24
+; GFX11-NEXT:    v_and_or_b32 v2, v10, v2, v21
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v16, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[12:13], v23, v[6:7]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v20, 64, v24
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v25, 64, v23
 ; GFX11-NEXT:    v_lshl_or_b32 v9, v14, 31, v9
 ; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 1, v[14:15]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v23, v[4:5]
 ; GFX11-NEXT:    v_or_b32_e32 v12, v10, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX11-NEXT:    v_or_b32_e32 v5, v11, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0, v16, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v20, 64, v22
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v22
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], v22, v[8:9]
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v22
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v3, v12, vcc_lo
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v24
+; GFX11-NEXT:    v_lshrrev_b64 v[18:19], v24, v[8:9]
+; GFX11-NEXT:    v_or_b32_e32 v13, v11, v13
 ; GFX11-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v25, v[4:5]
 ; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[14:15]
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], v22, v[14:15]
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v22
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v24
-; GFX11-NEXT:    v_or_b32_e32 v16, v18, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, v18, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s0
-; GFX11-NEXT:    v_or_b32_e32 v3, v23, v25
-; GFX11-NEXT:    v_or_b32_e32 v4, v13, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v14, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v9
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v23
+; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v24
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v23
+; GFX11-NEXT:    v_or_b32_e32 v18, v18, v20
+; GFX11-NEXT:    v_or_b32_e32 v19, v19, v21
+; GFX11-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v4, v12 :: v_dual_cndmask_b32 v5, v5, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v12, v10, v18, s0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v24
+; GFX11-NEXT:    v_cndmask_b32_e64 v13, v11, v19, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v24, v[14:15]
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v4, v6, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s1
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v12, v8, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v13, v9, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v3, v22
+; GFX11-NEXT:    v_and_or_b32 v7, 0, v11, v7
+; GFX11-NEXT:    v_and_or_b32 v4, v25, v16, v4
+; GFX11-NEXT:    v_and_or_b32 v5, 0, v17, v8
+; GFX11-NEXT:    v_and_or_b32 v6, v9, v10, v6
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index c8455665e7b40f..5be266e141edf5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -5814,40 +5814,37 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_sub_i32 s11, s8, 64
-; GFX6-NEXT:    s_sub_i32 s9, 64, s8
+; GFX6-NEXT:    s_sub_i32 s2, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s8
-; GFX6-NEXT:    s_lshr_b64 s[14:15], s[12:13], s9
+; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[12:13], s2
 ; GFX6-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX6-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX6-NEXT:    s_lshl_b64 s[12:13], s[12:13], s11
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[12:13]
-; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT:    s_sub_i32 s14, s10, 64
-; GFX6-NEXT:    s_sub_i32 s12, 64, s10
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[12:13], s11
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX6-NEXT:    s_sub_i32 s9, s10, 64
+; GFX6-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX6-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
-; GFX6-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX6-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX6-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX6-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
-; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX6-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX6-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[12:13], s[6:7], s10
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[6:7], s11
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], s[12:13]
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fshr_i128:
@@ -5860,40 +5857,37 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_sub_i32 s11, s8, 64
-; GFX8-NEXT:    s_sub_i32 s9, 64, s8
+; GFX8-NEXT:    s_sub_i32 s2, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[12:13], s8
-; GFX8-NEXT:    s_lshr_b64 s[14:15], s[12:13], s9
+; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[12:13], s2
 ; GFX8-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX8-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX8-NEXT:    s_lshl_b64 s[12:13], s[12:13], s11
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[12:13]
-; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX8-NEXT:    s_sub_i32 s14, s10, 64
-; GFX8-NEXT:    s_sub_i32 s12, 64, s10
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[12:13], s11
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
+; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX8-NEXT:    s_sub_i32 s9, s10, 64
+; GFX8-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX8-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
-; GFX8-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX8-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX8-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
-; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX8-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[12:13], s[6:7], s10
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[6:7], s11
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], s[12:13]
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fshr_i128:
@@ -5906,40 +5900,37 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX9-NEXT:    s_mov_b32 s1, 0
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_sub_i32 s11, s8, 64
-; GFX9-NEXT:    s_sub_i32 s9, 64, s8
+; GFX9-NEXT:    s_sub_i32 s2, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[12:13], s8
-; GFX9-NEXT:    s_lshr_b64 s[14:15], s[12:13], s9
+; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[12:13], s2
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX9-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX9-NEXT:    s_lshl_b64 s[12:13], s[12:13], s11
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[12:13]
-; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX9-NEXT:    s_sub_i32 s14, s10, 64
-; GFX9-NEXT:    s_sub_i32 s12, 64, s10
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[12:13], s11
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX9-NEXT:    s_sub_i32 s9, s10, 64
+; GFX9-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX9-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s10
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT:    s_lshl_b64 s[12:13], s[6:7], s12
-; GFX9-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX9-NEXT:    s_cselect_b64 s[6:7], s[10:11], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX9-NEXT:    s_cselect_b64 s[6:7], s[0:1], 0
-; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[12:13], s[6:7], s10
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[6:7], s11
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[12:13]
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fshr_i128:
@@ -5954,37 +5945,34 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX10-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[12:13], s[0:1], s9
-; GFX10-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
-; GFX10-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX10-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX10-NEXT:    s_or_b64 s[8:9], s[12:13], s[8:9]
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s14, s10, 64
+; GFX10-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX10-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
 ; GFX10-NEXT:    s_lshl_b64 s[12:13], s[6:7], s11
 ; GFX10-NEXT:    s_lshr_b64 s[10:11], s[6:7], s10
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX10-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
-; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[10:11], 0
-; GFX10-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -6000,37 +5988,35 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 ; GFX11-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX11-NEXT:    s_cselect_b32 s17, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s15, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[12:13], s[0:1], s9
-; GFX11-NEXT:    s_lshl_b64 s[14:15], s[2:3], s8
-; GFX11-NEXT:    s_lshl_b64 s[8:9], s[0:1], s8
-; GFX11-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s11
-; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[12:13], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX11-NEXT:    s_or_b64 s[8:9], s[12:13], s[8:9]
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s14, s10, 64
+; GFX11-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX11-NEXT:    s_cselect_b32 s15, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[4:5], s10
 ; GFX11-NEXT:    s_lshl_b64 s[12:13], s[6:7], s11
 ; GFX11-NEXT:    s_lshr_b64 s[10:11], s[6:7], s10
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
+; GFX11-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
+; GFX11-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[6:7]
-; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s15, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[10:11], 0
-; GFX11-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT:    s_bfe_u64 s[4:5], s[8:9], 0x10000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b64 s[4:5], s[4:5], s[10:11]
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6041,145 +6027,127 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshr_i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v14, 0x7f, v8
+; GFX6-NEXT:    v_and_b32_e32 v12, 0x7f, v8
 ; GFX6-NEXT:    v_not_b32_e32 v8, v8
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    v_and_b32_e32 v15, 0x7f, v8
+; GFX6-NEXT:    v_and_b32_e32 v13, 0x7f, v8
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[0:1], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v15
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v13
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], v0
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], v15
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, 64, v15
-; GFX6-NEXT:    v_lshl_b64 v[12:13], v[8:9], v15
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[2:3], v13
+; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, 64, v13
 ; GFX6-NEXT:    v_or_b32_e32 v10, v0, v10
 ; GFX6-NEXT:    v_or_b32_e32 v11, v1, v11
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[8:9], v16
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX6-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[8:9], v14
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v11, v1, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v14
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], v14
-; GFX6-NEXT:    v_lshl_b64 v[2:3], v[6:7], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v15, vcc, 64, v14
-; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], v15
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[6:7], v14
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v12
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], v12
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[6:7], v8
+; GFX6-NEXT:    v_subrev_i32_e32 v13, vcc, 64, v12
+; GFX6-NEXT:    v_or_b32_e32 v8, v0, v8
+; GFX6-NEXT:    v_or_b32_e32 v9, v1, v9
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], v13
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[6:7], v12
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v10
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v12, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v13, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, v10, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v14, 0x7f, v8
+; GFX8-NEXT:    v_and_b32_e32 v12, 0x7f, v8
 ; GFX8-NEXT:    v_not_b32_e32 v8, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v15, 0x7f, v8
+; GFX8-NEXT:    v_and_b32_e32 v13, 0x7f, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v15
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v13
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v15, v[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, 64, v15
-; GFX8-NEXT:    v_lshlrev_b64 v[12:13], v15, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v13, v[2:3]
+; GFX8-NEXT:    v_subrev_u32_e32 v14, vcc, 64, v13
 ; GFX8-NEXT:    v_or_b32_e32 v10, v0, v10
 ; GFX8-NEXT:    v_or_b32_e32 v11, v1, v11
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v16, v[8:9]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v14, v[8:9]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v1, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v14
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v14, v[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; GFX8-NEXT:    v_subrev_u32_e32 v15, vcc, 64, v14
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v14, v[6:7]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v12
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v12, v[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v8, v[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v13, vcc, 64, v12
+; GFX8-NEXT:    v_or_b32_e32 v8, v0, v8
+; GFX8-NEXT:    v_or_b32_e32 v9, v1, v9
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v13, v[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v12, v[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v10
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v13, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v10, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v14, 0x7f, v8
+; GFX9-NEXT:    v_and_b32_e32 v12, 0x7f, v8
 ; GFX9-NEXT:    v_not_b32_e32 v8, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v15, 0x7f, v8
+; GFX9-NEXT:    v_and_b32_e32 v13, 0x7f, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v15
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v13
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, v[8:9]
-; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v15, v[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v16, 64, v15
-; GFX9-NEXT:    v_lshlrev_b64 v[12:13], v15, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v13, v[2:3]
+; GFX9-NEXT:    v_subrev_u32_e32 v14, 64, v13
 ; GFX9-NEXT:    v_or_b32_e32 v10, v0, v10
 ; GFX9-NEXT:    v_or_b32_e32 v11, v1, v11
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v16, v[8:9]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v14, v[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v14
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v1, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v14, v[4:5]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v12, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
-; GFX9-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GFX9-NEXT:    v_subrev_u32_e32 v13, 64, v12
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v15, v[6:7]
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v14, v[6:7]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v13, v[6:7]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v12, v[6:7]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v12, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v13, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v10, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v8, v10
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v9, v11
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i128:
@@ -6188,98 +6156,90 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_not_b32_e32 v9, v8
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 31, v1
-; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v8
 ; GFX10-NEXT:    v_and_b32_e32 v18, 0x7f, v9
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
-; GFX10-NEXT:    v_lshrrev_b64 v[12:13], v19, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
+; GFX10-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[14:15], v18, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX10-NEXT:    v_lshrrev_b64 v[12:13], v19, v[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v21, v[6:7]
+; GFX10-NEXT:    v_or_b32_e32 v8, v10, v8
+; GFX10-NEXT:    v_or_b32_e32 v9, v11, v9
+; GFX10-NEXT:    v_or_b32_e32 v10, v14, v16
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v19
-; GFX10-NEXT:    v_or_b32_e32 v12, v12, v16
-; GFX10-NEXT:    v_or_b32_e32 v10, v10, v8
-; GFX10-NEXT:    v_or_b32_e32 v11, v11, v9
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v21, v[6:7]
-; GFX10-NEXT:    v_or_b32_e32 v13, v13, v17
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v19
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v0, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v1, v11, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v18
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v15, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v14, v4
-; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v8, v15, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v10, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s4
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v6, v5, s5
+; GFX10-NEXT:    v_and_or_b32 v2, v7, v12, v2
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v13, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v9, v8
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 31, v1
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0x7f, v9
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b64 v[14:15], v18, v[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
 ; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 64, v18
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v20, 64, v18
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v18, v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0, v14, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], v19, v[4:5]
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v20, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v8, v10, v8
+; GFX11-NEXT:    v_or_b32_e32 v9, v11, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v18
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX11-NEXT:    v_sub_nc_u32_e32 v16, 64, v19
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v21, 64, v19
+; GFX11-NEXT:    v_lshrrev_b64 v[14:15], v19, v[4:5]
+; GFX11-NEXT:    v_lshrrev_b64 v[12:13], v19, v[6:7]
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v8
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v9
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v21, v[6:7]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v16, v[6:7]
+; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v21, v[6:7]
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v19
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v16
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v17
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v19, v[6:7]
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0, v15, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v18
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v10, v14, v16
+; GFX11-NEXT:    v_or_b32_e32 v8, v15, v17
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v13, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v14, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v7, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, v10, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v6, v5, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_and_or_b32 v2, v7, v12, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
   ret i128 %result
@@ -6288,106 +6248,94 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshr_i128_ssv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_and_b32_e32 v6, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX6-NEXT:    v_not_b32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s9, 0
-; GFX6-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX6-NEXT:    v_and_b32_e32 v4, 0x7f, v0
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX6-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v7
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v4
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], s[10:11], v0
-; GFX6-NEXT:    v_lshl_b64 v[2:3], s[0:1], v7
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v7
-; GFX6-NEXT:    v_lshl_b64 v[4:5], s[10:11], v7
+; GFX6-NEXT:    v_lshl_b64 v[2:3], s[0:1], v4
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 64, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshl_b64 v[0:1], s[10:11], v8
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT:    v_lshl_b64 v[0:1], s[10:11], v5
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v6
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v6
-; GFX6-NEXT:    v_lshl_b64 v[2:3], s[6:7], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v6
-; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], s[6:7], v11
-; GFX6-NEXT:    v_lshr_b64 v[4:5], s[6:7], v6
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v8
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[4:5], v8
+; GFX6-NEXT:    v_lshl_b64 v[4:5], s[6:7], v4
+; GFX6-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
+; GFX6-NEXT:    v_or_b32_e32 v4, v0, v4
+; GFX6-NEXT:    v_or_b32_e32 v5, v1, v5
+; GFX6-NEXT:    v_lshr_b64 v[0:1], s[6:7], v9
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX6-NEXT:    v_lshr_b64 v[6:7], s[6:7], v8
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v5, s5
+; GFX6-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshr_i128_ssv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_and_b32_e32 v6, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 0x7f, v0
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    s_mov_b32 s9, 0
-; GFX8-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 0x7f, v0
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v7
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v4
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[10:11]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v7, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v7
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v7, s[10:11]
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v4, s[0:1]
+; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 64, v4
 ; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v8, s[10:11]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v5, s[10:11]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v6
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v6, s[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, 64, v6
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v8, s[4:5]
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v4, s[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v9, vcc, 64, v8
+; GFX8-NEXT:    v_or_b32_e32 v4, v0, v4
+; GFX8-NEXT:    v_or_b32_e32 v5, v1, v5
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v9, s[6:7]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshr_i128_ssv:
@@ -6395,51 +6343,45 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0x7f, v0
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    s_mov_b32 s9, 0
-; GFX9-NEXT:    v_and_b32_e32 v7, 0x7f, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7f, v0
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX9-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v7
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v4
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[10:11]
-; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, s[0:1]
-; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v7
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v7, s[10:11]
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v4, s[0:1]
+; GFX9-NEXT:    v_subrev_u32_e32 v5, 64, v4
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, s[10:11]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v5, s[10:11]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v2, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v1, v3, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v6, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX9-NEXT:    v_subrev_u32_e32 v11, 64, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v9, 64, v6
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v9, s[6:7]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v6, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v9, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v4, v7
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v5, v8
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshr_i128_ssv:
@@ -6450,96 +6392,82 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
 ; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0x7f, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
+; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 64, v13
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v13
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v12
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v13, s[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v6, s[6:7]
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v2, s[10:11]
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, s[10:11]
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v12, s[0:1]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, s[6:7]
 ; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
-; GFX10-NEXT:    v_or_b32_e32 v4, v4, v8
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v1
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX10-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v0, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v10, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v11, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v1, s3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, s8, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, s9, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX10-NEXT:    v_or_b32_e32 v0, v6, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX10-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v4, s5, s1
+; GFX10-NEXT:    v_and_or_b32 v2, v5, v2, v6
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v3, v7
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_fshr_i128_ssv:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_not_b32_e32 v1, v0
-; GFX11-NEXT:    s_lshr_b32 s8, s1, 31
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 1
+; GFX11-NEXT:    v_and_b32_e32 v13, 0x7f, v0
 ; GFX11-NEXT:    s_mov_b32 s9, 0
 ; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GFX11-NEXT:    s_lshr_b32 s8, s1, 31
 ; GFX11-NEXT:    v_and_b32_e32 v12, 0x7f, v1
-; GFX11-NEXT:    s_or_b64 s[8:9], s[2:3], s[8:9]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v12, s[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0x7f, v0
+; GFX11-NEXT:    s_lshl_b64 s[10:11], s[0:1], 1
+; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 64, v13
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v13
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v12
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[8:9]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v8, 64, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v14, 64, v13
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v12
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v12, s[2:3]
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v13, s[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[10:11], v10, s[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v8, s[6:7]
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v6, s[6:7]
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v2, s[10:11]
+; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v8, s[10:11]
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v12
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, s[6:7]
 ; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v13
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v1
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v14, s[6:7]
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v3, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v12
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v0, s2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v10, v4, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v11, v5, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v1, s3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v8, s8, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v10, s9, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s5, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v6, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v4, s5, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX11-NEXT:    v_and_or_b32 v2, v5, v2, v6
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v3, v7
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
   %cast.result = bitcast i128 %result to <4 x float>
@@ -6557,47 +6485,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX6-NEXT:    s_mov_b32 s1, 0
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    s_sub_i32 s7, s4, 64
-; GFX6-NEXT:    s_sub_i32 s5, 64, s4
+; GFX6-NEXT:    s_sub_i32 s2, 64, s4
 ; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[8:9], s5
+; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[8:9], s2
 ; GFX6-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX6-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[8:9], s7
-; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX6-NEXT:    s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX6-NEXT:    s_sub_i32 s0, s6, 64
-; GFX6-NEXT:    s_sub_i32 s1, 64, s6
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT:    s_lshl_b64 s[4:5], s[8:9], s7
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT:    s_sub_i32 s3, s6, 64
+; GFX6-NEXT:    s_sub_i32 s4, 64, s6
 ; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX6-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], s6
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s1
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s4
+; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX6-NEXT:    v_lshr_b64 v[8:9], v[2:3], s6
-; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s7
+; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], s3
+; GFX6-NEXT:    s_and_b32 s3, 1, s2
 ; GFX6-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT:    s_and_b32 s0, 1, s8
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX6-NEXT:    s_and_b32 s3, 1, s5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT:    v_or_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, s5, v3
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX6-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX6-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX6-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshr_i128_svs:
@@ -6610,47 +6535,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX8-NEXT:    s_mov_b32 s1, 0
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_sub_i32 s7, s4, 64
-; GFX8-NEXT:    s_sub_i32 s5, 64, s4
+; GFX8-NEXT:    s_sub_i32 s2, 64, s4
 ; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[8:9], s5
+; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[8:9], s2
 ; GFX8-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX8-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[8:9], s7
-; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX8-NEXT:    s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX8-NEXT:    s_sub_i32 s0, s6, 64
-; GFX8-NEXT:    s_sub_i32 s1, 64, s6
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    s_lshl_b64 s[4:5], s[8:9], s7
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    s_sub_i32 s3, s6, 64
+; GFX8-NEXT:    s_sub_i32 s4, 64, s6
 ; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
+; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[8:9], s6, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX8-NEXT:    s_and_b32 s0, 1, s7
+; GFX8-NEXT:    v_lshrrev_b64 v[2:3], s3, v[2:3]
+; GFX8-NEXT:    s_and_b32 s3, 1, s2
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_and_b32 s0, 1, s8
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX8-NEXT:    s_and_b32 s3, 1, s5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT:    v_or_b32_e32 v2, s4, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, s5, v3
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX8-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshr_i128_svs:
@@ -6663,47 +6585,44 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX9-NEXT:    s_mov_b32 s1, 0
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_sub_i32 s7, s4, 64
-; GFX9-NEXT:    s_sub_i32 s5, 64, s4
+; GFX9-NEXT:    s_sub_i32 s2, 64, s4
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s13, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[8:9], s5
+; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[8:9], s2
 ; GFX9-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX9-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[8:9], s7
-; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX9-NEXT:    s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX9-NEXT:    s_sub_i32 s0, s6, 64
-; GFX9-NEXT:    s_sub_i32 s1, 64, s6
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[8:9], s7
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_sub_i32 s3, s6, 64
+; GFX9-NEXT:    s_sub_i32 s4, 64, s6
 ; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s1, v[2:3]
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[8:9], s6, v[2:3]
-; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX9-NEXT:    s_and_b32 s0, 1, s7
+; GFX9-NEXT:    v_lshrrev_b64 v[2:3], s3, v[2:3]
+; GFX9-NEXT:    s_and_b32 s3, 1, s2
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_and_b32 s0, 1, s8
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX9-NEXT:    s_and_b32 s3, 1, s5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, s3, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, s4, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, s5, v3
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v2, s2, v8
+; GFX9-NEXT:    v_and_b32_e32 v3, s3, v9
+; GFX9-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX9-NEXT:    v_or_b32_e32 v3, s1, v3
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshr_i128_svs:
@@ -6719,44 +6638,39 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX10-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s5
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s4
-; GFX10-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
-; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s0, 64, s6
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX10-NEXT:    s_sub_i32 s0, s6, 64
+; GFX10-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_sub_i32 s2, 64, s6
+; GFX10-NEXT:    s_sub_i32 s3, s6, 64
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s2, v[2:3]
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], s3, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
+; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10-NEXT:    s_and_b32 s3, 1, s2
 ; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, 1, s1
 ; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s0, 1, s7
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT:    s_and_b32 s3, 1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
-; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
-; GFX10-NEXT:    v_or_b32_e32 v0, s4, v0
-; GFX10-NEXT:    v_or_b32_e32 v1, s5, v1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX10-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10-NEXT:    v_and_or_b32 v2, s2, v2, s0
+; GFX10-NEXT:    v_and_or_b32 v3, s3, v3, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_fshr_i128_svs:
@@ -6772,47 +6686,38 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 ; GFX11-NEXT:    s_sub_i32 s5, 64, s4
 ; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    s_cselect_b32 s13, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[0:1], s5
-; GFX11-NEXT:    s_lshl_b64 s[10:11], s[2:3], s4
-; GFX11-NEXT:    s_lshl_b64 s[4:5], s[0:1], s4
-; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
-; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s0, 64, s6
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX11-NEXT:    s_sub_i32 s0, s6, 64
+; GFX11-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX11-NEXT:    s_sub_i32 s2, 64, s6
+; GFX11-NEXT:    s_sub_i32 s3, s6, 64
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s2, v[2:3]
 ; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], s3, v[2:3]
+; GFX11-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    s_and_b32 s3, 1, s2
 ; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX11-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX11-NEXT:    s_and_b32 s0, 1, s1
 ; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s0, 1, s7
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], s6, v[2:3]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX11-NEXT:    s_and_b32 s3, 1, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
-; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, s4, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, s5, v1
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
+; GFX11-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX11-NEXT:    v_and_or_b32 v2, s2, v2, s0
+; GFX11-NEXT:    v_and_or_b32 v3, s3, v3, s1
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
   %cast.result = bitcast i128 %result to <4 x float>
@@ -6822,161 +6727,152 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
 define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
 ; GFX6-LABEL: v_fshr_i128_vss:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
 ; GFX6-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX6-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    s_sub_i32 s5, s4, 64
-; GFX6-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[0:1], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
-; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX6-NEXT:    s_sub_i32 s5, s4, 64
+; GFX6-NEXT:    s_sub_i32 s7, 64, s4
+; GFX6-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[4:5], s7
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[2:3], s4
-; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], s4
-; GFX6-NEXT:    s_and_b32 s4, 1, s8
-; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX6-NEXT:    s_and_b32 s4, 1, s9
-; GFX6-NEXT:    s_sub_i32 s10, s6, 64
-; GFX6-NEXT:    s_sub_i32 s8, 64, s6
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX6-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX6-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[4:5], s5
+; GFX6-NEXT:    s_and_b32 s5, 1, s8
+; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT:    s_and_b32 s4, 1, s4
+; GFX6-NEXT:    s_sub_i32 s5, s6, 64
+; GFX6-NEXT:    s_sub_i32 s10, 64, s6
 ; GFX6-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX6-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[4:5], s[2:3], s6
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[2:3], s6
 ; GFX6-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], s5
+; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
 ; GFX6-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX6-NEXT:    v_or_b32_e32 v0, s0, v4
-; GFX6-NEXT:    v_or_b32_e32 v1, s1, v5
-; GFX6-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[4:5], 0x10000
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX6-NEXT:    v_or_b32_e32 v3, s3, v1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: v_fshr_i128_vss:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX8-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX8-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    s_sub_i32 s5, s4, 64
-; GFX8-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], 1, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
-; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX8-NEXT:    s_sub_i32 s5, s4, 64
+; GFX8-NEXT:    s_sub_i32 s7, 64, s4
+; GFX8-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s7, v[4:5]
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX8-NEXT:    v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX8-NEXT:    s_and_b32 s4, 1, s8
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT:    s_and_b32 s4, 1, s9
-; GFX8-NEXT:    s_sub_i32 s10, s6, 64
-; GFX8-NEXT:    s_sub_i32 s8, 64, s6
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX8-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX8-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], s5, v[4:5]
+; GFX8-NEXT:    s_and_b32 s5, 1, s8
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT:    s_and_b32 s4, 1, s4
+; GFX8-NEXT:    s_sub_i32 s5, s6, 64
+; GFX8-NEXT:    s_sub_i32 s10, 64, s6
 ; GFX8-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[4:5], s[2:3], s6
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[2:3], s6
 ; GFX8-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GFX8-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[2:3], s5
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
 ; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX8-NEXT:    v_or_b32_e32 v0, s0, v4
-; GFX8-NEXT:    v_or_b32_e32 v1, s1, v5
-; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[4:5], 0x10000
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
+; GFX8-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX8-NEXT:    v_or_b32_e32 v3, s3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: v_fshr_i128_vss:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX9-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
 ; GFX9-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    s_sub_i32 s5, s4, 64
-; GFX9-NEXT:    s_sub_i32 s7, 64, s4
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
-; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
+; GFX9-NEXT:    s_sub_i32 s5, s4, 64
+; GFX9-NEXT:    s_sub_i32 s7, 64, s4
+; GFX9-NEXT:    s_cmp_lt_u32 s4, 64
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s7, v[4:5]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX9-NEXT:    v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX9-NEXT:    s_and_b32 s4, 1, s8
-; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT:    s_and_b32 s4, 1, s9
-; GFX9-NEXT:    s_sub_i32 s10, s6, 64
-; GFX9-NEXT:    s_sub_i32 s8, 64, s6
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX9-NEXT:    v_or_b32_e32 v6, v0, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v1, v7
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], s5, v[4:5]
+; GFX9-NEXT:    s_and_b32 s5, 1, s8
+; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT:    s_and_b32 s4, 1, s4
+; GFX9-NEXT:    s_sub_i32 s5, s6, 64
+; GFX9-NEXT:    s_sub_i32 s10, 64, s6
 ; GFX9-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
 ; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], s6
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[2:3], s6
 ; GFX9-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
-; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], s5
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], s[6:7], s[2:3]
 ; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[4:5], 0
-; GFX9-NEXT:    v_or_b32_e32 v0, s0, v4
-; GFX9-NEXT:    v_or_b32_e32 v1, s1, v5
-; GFX9-NEXT:    v_or_b32_e32 v2, s2, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, s3, v3
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[4:5], 0x10000
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], s[8:9]
+; GFX9-NEXT:    v_or_b32_e32 v2, s2, v0
+; GFX9-NEXT:    v_or_b32_e32 v3, s3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: v_fshr_i128_vss:
@@ -6984,50 +6880,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX10-NEXT:    s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX10-NEXT:    s_sub_i32 s7, 64, s4
+; GFX10-NEXT:    s_andn2_b64 s[6:7], 0x7f, s[4:5]
+; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], 0x7f
+; GFX10-NEXT:    s_sub_i32 s5, 64, s6
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX10-NEXT:    s_sub_i32 s5, s4, 64
-; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
-; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], s4, v[0:1]
-; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX10-NEXT:    s_and_b32 s4, 1, s8
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT:    s_and_b32 s4, 1, s9
-; GFX10-NEXT:    s_sub_i32 s10, s6, 64
-; GFX10-NEXT:    s_sub_i32 s7, 64, s6
+; GFX10-NEXT:    s_sub_i32 s7, s6, 64
 ; GFX10-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], s5, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_and_b32 s5, 1, s5
+; GFX10-NEXT:    s_sub_i32 s7, s4, 64
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX10-NEXT:    s_and_b32 s5, 1, s6
+; GFX10-NEXT:    s_sub_i32 s10, 64, s4
+; GFX10-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX10-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX10-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10-NEXT:    s_lshr_b64 s[4:5], s[0:1], s6
-; GFX10-NEXT:    s_lshl_b64 s[8:9], s[2:3], s7
-; GFX10-NEXT:    s_lshr_b64 s[6:7], s[2:3], s6
-; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_lshr_b64 s[8:9], s[0:1], s4
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX10-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
+; GFX10-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX10-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX10-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
-; GFX10-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX10-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -7037,48 +6930,46 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX11-NEXT:    s_and_not1_b64 s[4:5], 0x7f, s[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_sub_i32 s7, 64, s4
+; GFX11-NEXT:    s_and_not1_b64 s[6:7], 0x7f, s[4:5]
+; GFX11-NEXT:    s_and_b64 s[4:5], s[4:5], 0x7f
+; GFX11-NEXT:    s_sub_i32 s5, 64, s6
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX11-NEXT:    s_sub_i32 s5, s4, 64
-; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
-; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s7, v[0:1]
-; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
-; GFX11-NEXT:    v_lshlrev_b64 v[8:9], s4, v[0:1]
-; GFX11-NEXT:    s_cselect_b32 s9, 1, 0
-; GFX11-NEXT:    s_and_b32 s4, 1, s8
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT:    s_and_b32 s4, 1, s9
-; GFX11-NEXT:    s_sub_i32 s10, s6, 64
-; GFX11-NEXT:    s_sub_i32 s7, 64, s6
+; GFX11-NEXT:    s_sub_i32 s7, s6, 64
 ; GFX11-NEXT:    s_cmp_lt_u32 s6, 64
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9
-; GFX11-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], s5, v[0:1]
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], s6, v[2:3]
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], s7, v[0:1]
+; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-NEXT:    s_and_b32 s5, 1, s5
+; GFX11-NEXT:    s_sub_i32 s7, s4, 64
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX11-NEXT:    s_and_b32 s5, 1, s6
+; GFX11-NEXT:    s_sub_i32 s10, 64, s4
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX11-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX11-NEXT:    s_cmp_lt_u32 s4, 64
+; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
 ; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX11-NEXT:    s_lshr_b64 s[4:5], s[0:1], s6
-; GFX11-NEXT:    s_lshl_b64 s[8:9], s[2:3], s7
-; GFX11-NEXT:    s_lshr_b64 s[6:7], s[2:3], s6
-; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX11-NEXT:    s_lshr_b64 s[8:9], s[0:1], s4
+; GFX11-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX11-NEXT:    s_lshr_b64 s[4:5], s[2:3], s4
+; GFX11-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s7
+; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX11-NEXT:    s_cselect_b64 s[2:3], s[8:9], s[2:3]
 ; GFX11-NEXT:    s_cmp_lg_u32 s12, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GFX11-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX11-NEXT:    v_or_b32_e32 v0, s0, v6
-; GFX11-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
-; GFX11-NEXT:    v_or_b32_e32 v1, s1, v7
+; GFX11-NEXT:    s_bfe_u64 s[2:3], s[6:7], 0x10000
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_or_b32_e32 v2, s2, v2
 ; GFX11-NEXT:    v_or_b32_e32 v3, s3, v3
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -7223,40 +7114,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[22:23], s[0:1], 1
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[24:25]
 ; GFX6-NEXT:    s_sub_i32 s19, s16, 64
-; GFX6-NEXT:    s_sub_i32 s17, 64, s16
+; GFX6-NEXT:    s_sub_i32 s2, 64, s16
 ; GFX6-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX6-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[2:3], s[22:23], s16
-; GFX6-NEXT:    s_lshr_b64 s[26:27], s[22:23], s17
+; GFX6-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[22:23], s2
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX6-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX6-NEXT:    s_lshl_b64 s[22:23], s[22:23], s19
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[16:17]
+; GFX6-NEXT:    s_lshl_b64 s[16:17], s[22:23], s19
 ; GFX6-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[22:23]
-; GFX6-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX6-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX6-NEXT:    s_sub_i32 s24, s18, 64
-; GFX6-NEXT:    s_sub_i32 s22, 64, s18
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX6-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX6-NEXT:    s_sub_i32 s17, s18, 64
+; GFX6-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX6-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX6-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX6-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[0:1], s[10:11], s18
-; GFX6-NEXT:    s_lshr_b64 s[18:19], s[8:9], s18
-; GFX6-NEXT:    s_lshl_b64 s[22:23], s[10:11], s22
-; GFX6-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
-; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s24
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX6-NEXT:    s_cselect_b64 s[10:11], s[18:19], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX6-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
-; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX6-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
+; GFX6-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[22:23], s[10:11], s18
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
+; GFX6-NEXT:    s_lshl_b64 s[18:19], s[10:11], s19
+; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX6-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX6-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], s[22:23]
+; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX6-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX6-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX6-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
@@ -7264,40 +7152,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX6-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
 ; GFX6-NEXT:    s_or_b64 s[4:5], s[6:7], s[24:25]
 ; GFX6-NEXT:    s_sub_i32 s9, s10, 64
-; GFX6-NEXT:    s_sub_i32 s11, 64, s10
+; GFX6-NEXT:    s_sub_i32 s6, 64, s10
 ; GFX6-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX6-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX6-NEXT:    s_lshl_b64 s[6:7], s[16:17], s10
-; GFX6-NEXT:    s_lshr_b64 s[18:19], s[16:17], s11
+; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[16:17], s6
 ; GFX6-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
-; GFX6-NEXT:    s_lshl_b64 s[16:17], s[16:17], s9
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX6-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[16:17]
-; GFX6-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX6-NEXT:    s_cselect_b64 s[10:11], s[4:5], s[10:11]
-; GFX6-NEXT:    s_sub_i32 s18, s8, 64
-; GFX6-NEXT:    s_sub_i32 s16, 64, s8
+; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX6-NEXT:    s_lshl_b64 s[10:11], s[16:17], s9
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b64 s[6:7], s[4:5], s[6:7]
+; GFX6-NEXT:    s_sub_i32 s11, s8, 64
+; GFX6-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX6-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX6-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX6-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
-; GFX6-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
-; GFX6-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
-; GFX6-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
-; GFX6-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
-; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
-; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX6-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
-; GFX6-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
-; GFX6-NEXT:    s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX6-NEXT:    s_lshr_b64 s[16:17], s[14:15], s8
+; GFX6-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[14:15], s9
+; GFX6-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[14:15], s11
+; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX6-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX6-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_fshr_v2i128:
@@ -7310,40 +7195,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[22:23], s[0:1], 1
 ; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[24:25]
 ; GFX8-NEXT:    s_sub_i32 s19, s16, 64
-; GFX8-NEXT:    s_sub_i32 s17, 64, s16
+; GFX8-NEXT:    s_sub_i32 s2, 64, s16
 ; GFX8-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[2:3], s[22:23], s16
-; GFX8-NEXT:    s_lshr_b64 s[26:27], s[22:23], s17
+; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[2:3], s[22:23], s2
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX8-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX8-NEXT:    s_lshl_b64 s[22:23], s[22:23], s19
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[16:17]
+; GFX8-NEXT:    s_lshl_b64 s[16:17], s[22:23], s19
 ; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[22:23]
-; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX8-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX8-NEXT:    s_sub_i32 s24, s18, 64
-; GFX8-NEXT:    s_sub_i32 s22, 64, s18
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX8-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX8-NEXT:    s_sub_i32 s17, s18, 64
+; GFX8-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX8-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[0:1], s[10:11], s18
-; GFX8-NEXT:    s_lshr_b64 s[18:19], s[8:9], s18
-; GFX8-NEXT:    s_lshl_b64 s[22:23], s[10:11], s22
-; GFX8-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
-; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s24
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX8-NEXT:    s_cselect_b64 s[10:11], s[18:19], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX8-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
-; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX8-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
+; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[22:23], s[10:11], s18
+; GFX8-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
+; GFX8-NEXT:    s_lshl_b64 s[18:19], s[10:11], s19
+; GFX8-NEXT:    s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX8-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX8-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
+; GFX8-NEXT:    s_and_b64 s[8:9], s[8:9], s[22:23]
+; GFX8-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX8-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX8-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX8-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
@@ -7351,40 +7233,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX8-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
 ; GFX8-NEXT:    s_or_b64 s[4:5], s[6:7], s[24:25]
 ; GFX8-NEXT:    s_sub_i32 s9, s10, 64
-; GFX8-NEXT:    s_sub_i32 s11, 64, s10
+; GFX8-NEXT:    s_sub_i32 s6, 64, s10
 ; GFX8-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX8-NEXT:    s_lshl_b64 s[6:7], s[16:17], s10
-; GFX8-NEXT:    s_lshr_b64 s[18:19], s[16:17], s11
+; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[6:7], s[16:17], s6
 ; GFX8-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
-; GFX8-NEXT:    s_lshl_b64 s[16:17], s[16:17], s9
-; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX8-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[16:17]
-; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX8-NEXT:    s_cselect_b64 s[10:11], s[4:5], s[10:11]
-; GFX8-NEXT:    s_sub_i32 s18, s8, 64
-; GFX8-NEXT:    s_sub_i32 s16, 64, s8
+; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX8-NEXT:    s_lshl_b64 s[10:11], s[16:17], s9
+; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
+; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX8-NEXT:    s_cselect_b64 s[6:7], s[4:5], s[6:7]
+; GFX8-NEXT:    s_sub_i32 s11, s8, 64
+; GFX8-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX8-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX8-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX8-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
-; GFX8-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
-; GFX8-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
-; GFX8-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
-; GFX8-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
-; GFX8-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX8-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
-; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX8-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
-; GFX8-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
-; GFX8-NEXT:    s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX8-NEXT:    s_lshr_b64 s[16:17], s[14:15], s8
+; GFX8-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
+; GFX8-NEXT:    s_lshl_b64 s[8:9], s[14:15], s9
+; GFX8-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT:    s_lshr_b64 s[8:9], s[14:15], s11
+; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX8-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX8-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX8-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_fshr_v2i128:
@@ -7397,40 +7276,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[22:23], s[0:1], 1
 ; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[24:25]
 ; GFX9-NEXT:    s_sub_i32 s19, s16, 64
-; GFX9-NEXT:    s_sub_i32 s17, 64, s16
+; GFX9-NEXT:    s_sub_i32 s2, 64, s16
 ; GFX9-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[22:23], s16
-; GFX9-NEXT:    s_lshr_b64 s[26:27], s[22:23], s17
+; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[22:23], s2
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX9-NEXT:    s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX9-NEXT:    s_lshl_b64 s[22:23], s[22:23], s19
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[16:17]
+; GFX9-NEXT:    s_lshl_b64 s[16:17], s[22:23], s19
 ; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
-; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT:    s_cselect_b64 s[16:17], s[16:17], s[22:23]
-; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
-; GFX9-NEXT:    s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX9-NEXT:    s_sub_i32 s24, s18, 64
-; GFX9-NEXT:    s_sub_i32 s22, 64, s18
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX9-NEXT:    s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX9-NEXT:    s_sub_i32 s17, s18, 64
+; GFX9-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX9-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[0:1], s[10:11], s18
-; GFX9-NEXT:    s_lshr_b64 s[18:19], s[8:9], s18
-; GFX9-NEXT:    s_lshl_b64 s[22:23], s[10:11], s22
-; GFX9-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
-; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s24
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX9-NEXT:    s_cselect_b64 s[10:11], s[18:19], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX9-NEXT:    s_cselect_b64 s[10:11], s[0:1], 0
-; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX9-NEXT:    s_or_b64 s[2:3], s[16:17], s[10:11]
+; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[22:23], s[10:11], s18
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
+; GFX9-NEXT:    s_lshl_b64 s[18:19], s[10:11], s19
+; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX9-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], s[22:23]
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX9-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
 ; GFX9-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
 ; GFX9-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
@@ -7438,40 +7314,37 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX9-NEXT:    s_lshl_b64 s[16:17], s[4:5], 1
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[24:25]
 ; GFX9-NEXT:    s_sub_i32 s9, s10, 64
-; GFX9-NEXT:    s_sub_i32 s11, 64, s10
+; GFX9-NEXT:    s_sub_i32 s6, 64, s10
 ; GFX9-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
-; GFX9-NEXT:    s_lshl_b64 s[6:7], s[16:17], s10
-; GFX9-NEXT:    s_lshr_b64 s[18:19], s[16:17], s11
+; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[6:7], s[16:17], s6
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT:    s_or_b64 s[10:11], s[18:19], s[10:11]
-; GFX9-NEXT:    s_lshl_b64 s[16:17], s[16:17], s9
-; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], 0
-; GFX9-NEXT:    s_cselect_b64 s[10:11], s[10:11], s[16:17]
-; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
-; GFX9-NEXT:    s_cselect_b64 s[10:11], s[4:5], s[10:11]
-; GFX9-NEXT:    s_sub_i32 s18, s8, 64
-; GFX9-NEXT:    s_sub_i32 s16, 64, s8
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[16:17], s9
+; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[10:11]
+; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX9-NEXT:    s_cselect_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-NEXT:    s_sub_i32 s11, s8, 64
+; GFX9-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX9-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX9-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[14:15], s8
-; GFX9-NEXT:    s_lshr_b64 s[8:9], s[12:13], s8
-; GFX9-NEXT:    s_lshl_b64 s[16:17], s[14:15], s16
-; GFX9-NEXT:    s_or_b64 s[8:9], s[8:9], s[16:17]
-; GFX9-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[8:9], s[14:15]
-; GFX9-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX9-NEXT:    s_cselect_b64 s[8:9], s[12:13], s[8:9]
-; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX9-NEXT:    s_cselect_b64 s[12:13], s[4:5], 0
-; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
-; GFX9-NEXT:    s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
+; GFX9-NEXT:    s_lshr_b64 s[16:17], s[14:15], s8
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
+; GFX9-NEXT:    s_lshl_b64 s[8:9], s[14:15], s9
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[14:15], s11
+; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[10:11], 0x10000
+; GFX9-NEXT:    s_and_b64 s[8:9], s[8:9], s[16:17]
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fshr_v2i128:
@@ -7488,76 +7361,70 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX10-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX10-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[24:25], s[0:1], s17
-; GFX10-NEXT:    s_lshl_b64 s[26:27], s[2:3], s16
-; GFX10-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX10-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX10-NEXT:    s_lshl_b64 s[16:17], s[2:3], s16
 ; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
+; GFX10-NEXT:    s_or_b64 s[16:17], s[24:25], s[16:17]
 ; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX10-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
-; GFX10-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX10-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX10-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT:    s_sub_i32 s22, s18, 64
+; GFX10-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX10-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX10-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX10-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX10-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
 ; GFX10-NEXT:    s_lshl_b64 s[24:25], s[10:11], s19
 ; GFX10-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
 ; GFX10-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
-; GFX10-NEXT:    s_lshr_b64 s[10:11], s[10:11], s22
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX10-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s22, 0
 ; GFX10-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX10-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[18:19], 0
+; GFX10-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
 ; GFX10-NEXT:    s_andn2_b64 s[10:11], 0x7f, s[20:21]
+; GFX10-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX10-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX10-NEXT:    s_lshr_b32 s22, s5, 31
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX10-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX10-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], 1
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[22:23]
 ; GFX10-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX10-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX10-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX10-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[16:17], s[4:5], s11
-; GFX10-NEXT:    s_lshl_b64 s[18:19], s[6:7], s10
-; GFX10-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX10-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX10-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
 ; GFX10-NEXT:    s_lshl_b64 s[4:5], s[4:5], s9
-; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX10-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
-; GFX10-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX10-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX10-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX10-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
-; GFX10-NEXT:    s_sub_i32 s18, s8, 64
+; GFX10-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX10-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX10-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX10-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX10-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
 ; GFX10-NEXT:    s_lshl_b64 s[16:17], s[14:15], s9
 ; GFX10-NEXT:    s_lshr_b64 s[8:9], s[14:15], s8
 ; GFX10-NEXT:    s_or_b64 s[4:5], s[4:5], s[16:17]
-; GFX10-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX10-NEXT:    s_lshr_b64 s[14:15], s[14:15], s11
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[14:15]
-; GFX10-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX10-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
-; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX10-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX10-NEXT:    s_bfe_u64 s[10:11], s[10:11], 0x10000
+; GFX10-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
 ; GFX10-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
@@ -7575,76 +7442,71 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 ; GFX11-NEXT:    s_cmp_lt_u32 s16, 64
 ; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s16, 0
-; GFX11-NEXT:    s_cselect_b32 s28, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[24:25], s[0:1], s17
-; GFX11-NEXT:    s_lshl_b64 s[26:27], s[2:3], s16
-; GFX11-NEXT:    s_lshl_b64 s[16:17], s[0:1], s16
-; GFX11-NEXT:    s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX11-NEXT:    s_lshl_b64 s[16:17], s[2:3], s16
 ; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s19
+; GFX11-NEXT:    s_or_b64 s[16:17], s[24:25], s[16:17]
 ; GFX11-NEXT:    s_cmp_lg_u32 s22, 0
-; GFX11-NEXT:    s_cselect_b64 s[16:17], s[16:17], 0
-; GFX11-NEXT:    s_cselect_b64 s[0:1], s[24:25], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s28, 0
+; GFX11-NEXT:    s_cselect_b64 s[0:1], s[16:17], s[0:1]
+; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
 ; GFX11-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT:    s_sub_i32 s22, s18, 64
+; GFX11-NEXT:    s_sub_i32 s17, s18, 64
 ; GFX11-NEXT:    s_sub_i32 s19, 64, s18
 ; GFX11-NEXT:    s_cmp_lt_u32 s18, 64
-; GFX11-NEXT:    s_cselect_b32 s26, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s16, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s18, 0
-; GFX11-NEXT:    s_cselect_b32 s27, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s22, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[0:1], s[8:9], s18
 ; GFX11-NEXT:    s_lshl_b64 s[24:25], s[10:11], s19
 ; GFX11-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
 ; GFX11-NEXT:    s_or_b64 s[0:1], s[0:1], s[24:25]
-; GFX11-NEXT:    s_lshr_b64 s[10:11], s[10:11], s22
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
+; GFX11-NEXT:    s_lshr_b64 s[10:11], s[10:11], s17
+; GFX11-NEXT:    s_cmp_lg_u32 s16, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX11-NEXT:    s_cmp_lg_u32 s27, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s22, 0
 ; GFX11-NEXT:    s_cselect_b64 s[0:1], s[8:9], s[0:1]
-; GFX11-NEXT:    s_cmp_lg_u32 s26, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[18:19], 0
+; GFX11-NEXT:    s_bfe_u64 s[8:9], s[16:17], 0x10000
 ; GFX11-NEXT:    s_and_not1_b64 s[10:11], 0x7f, s[20:21]
+; GFX11-NEXT:    s_and_b64 s[8:9], s[8:9], s[18:19]
 ; GFX11-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
 ; GFX11-NEXT:    s_lshr_b32 s22, s5, 31
 ; GFX11-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GFX11-NEXT:    s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX11-NEXT:    s_or_b64 s[0:1], s[16:17], s[0:1]
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], 1
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[22:23]
 ; GFX11-NEXT:    s_sub_i32 s9, s10, 64
 ; GFX11-NEXT:    s_sub_i32 s11, 64, s10
 ; GFX11-NEXT:    s_cmp_lt_u32 s10, 64
-; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s10, 0
-; GFX11-NEXT:    s_cselect_b32 s21, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[16:17], s[4:5], s11
-; GFX11-NEXT:    s_lshl_b64 s[18:19], s[6:7], s10
-; GFX11-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
-; GFX11-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX11-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
 ; GFX11-NEXT:    s_lshl_b64 s[4:5], s[4:5], s9
-; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX11-NEXT:    s_cselect_b64 s[10:11], s[10:11], 0
-; GFX11-NEXT:    s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s21, 0
+; GFX11-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
+; GFX11-NEXT:    s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
 ; GFX11-NEXT:    s_cselect_b64 s[6:7], s[6:7], s[4:5]
-; GFX11-NEXT:    s_sub_i32 s18, s8, 64
+; GFX11-NEXT:    s_sub_i32 s11, s8, 64
 ; GFX11-NEXT:    s_sub_i32 s9, 64, s8
 ; GFX11-NEXT:    s_cmp_lt_u32 s8, 64
-; GFX11-NEXT:    s_cselect_b32 s19, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s8, 0
-; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s18, 1, 0
 ; GFX11-NEXT:    s_lshr_b64 s[4:5], s[12:13], s8
 ; GFX11-NEXT:    s_lshl_b64 s[16:17], s[14:15], s9
 ; GFX11-NEXT:    s_lshr_b64 s[8:9], s[14:15], s8
 ; GFX11-NEXT:    s_or_b64 s[4:5], s[4:5], s[16:17]
-; GFX11-NEXT:    s_lshr_b64 s[14:15], s[14:15], s18
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX11-NEXT:    s_lshr_b64 s[14:15], s[14:15], s11
+; GFX11-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[14:15]
-; GFX11-NEXT:    s_cmp_lg_u32 s20, 0
+; GFX11-NEXT:    s_cmp_lg_u32 s18, 0
 ; GFX11-NEXT:    s_cselect_b64 s[4:5], s[12:13], s[4:5]
-; GFX11-NEXT:    s_cmp_lg_u32 s19, 0
-; GFX11-NEXT:    s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX11-NEXT:    s_bfe_u64 s[10:11], s[10:11], 0x10000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
 ; GFX11-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
@@ -7655,274 +7517,238 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-LABEL: v_fshr_v2i128:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v16
+; GFX6-NEXT:    v_and_b32_e32 v21, 0x7f, v16
 ; GFX6-NEXT:    v_not_b32_e32 v16, v16
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT:    v_and_b32_e32 v24, 0x7f, v16
+; GFX6-NEXT:    v_and_b32_e32 v22, 0x7f, v16
 ; GFX6-NEXT:    v_lshl_b64 v[16:17], v[0:1], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v24
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 64, v22
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[16:17], v0
-; GFX6-NEXT:    v_lshl_b64 v[18:19], v[2:3], v24
-; GFX6-NEXT:    v_subrev_i32_e32 v25, vcc, 64, v24
-; GFX6-NEXT:    v_lshl_b64 v[21:22], v[16:17], v24
+; GFX6-NEXT:    v_lshl_b64 v[18:19], v[2:3], v22
+; GFX6-NEXT:    v_subrev_i32_e32 v23, vcc, 64, v22
 ; GFX6-NEXT:    v_or_b32_e32 v18, v0, v18
 ; GFX6-NEXT:    v_or_b32_e32 v19, v1, v19
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[16:17], v25
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX6-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[16:17], v23
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v22
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v19, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX6-NEXT:    v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 64, v23
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], v23
-; GFX6-NEXT:    v_lshl_b64 v[2:3], v[10:11], v2
-; GFX6-NEXT:    v_subrev_i32_e32 v24, vcc, 64, v23
-; GFX6-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX6-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v24
-; GFX6-NEXT:    v_lshr_b64 v[16:17], v[10:11], v23
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v22
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, 64, v21
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[8:9], v21
+; GFX6-NEXT:    v_lshl_b64 v[16:17], v[10:11], v16
+; GFX6-NEXT:    v_subrev_i32_e32 v22, vcc, 64, v21
+; GFX6-NEXT:    v_or_b32_e32 v16, v0, v16
+; GFX6-NEXT:    v_or_b32_e32 v17, v1, v17
+; GFX6-NEXT:    v_lshr_b64 v[0:1], v[10:11], v22
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v21
+; GFX6-NEXT:    v_lshr_b64 v[18:19], v[10:11], v21
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v21
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v8, v8, v18
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX6-NEXT:    v_not_b32_e32 v8, v20
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GFX6-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX6-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX6-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
 ; GFX6-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v19
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v17
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[8:9], v4
-; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v19
-; GFX6-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX6-NEXT:    v_and_b32_e32 v18, 0x7f, v20
-; GFX6-NEXT:    v_subrev_i32_e32 v20, vcc, 64, v19
-; GFX6-NEXT:    v_lshl_b64 v[16:17], v[8:9], v19
+; GFX6-NEXT:    v_lshl_b64 v[10:11], v[6:7], v17
+; GFX6-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v17
 ; GFX6-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX6-NEXT:    v_or_b32_e32 v11, v5, v11
-; GFX6-NEXT:    v_lshl_b64 v[4:5], v[8:9], v20
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX6-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX6-NEXT:    v_lshl_b64 v[4:5], v[8:9], v18
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX6-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v4, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 64, v18
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], v18
-; GFX6-NEXT:    v_lshl_b64 v[6:7], v[14:15], v6
-; GFX6-NEXT:    v_subrev_i32_e32 v19, vcc, 64, v18
-; GFX6-NEXT:    v_or_b32_e32 v6, v4, v6
-; GFX6-NEXT:    v_or_b32_e32 v7, v5, v7
-; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], v19
-; GFX6-NEXT:    v_lshr_b64 v[8:9], v[14:15], v18
-; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v16
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[12:13], v16
+; GFX6-NEXT:    v_lshl_b64 v[8:9], v[14:15], v8
+; GFX6-NEXT:    v_subrev_i32_e32 v17, vcc, 64, v16
+; GFX6-NEXT:    v_or_b32_e32 v8, v4, v8
+; GFX6-NEXT:    v_or_b32_e32 v9, v5, v9
+; GFX6-NEXT:    v_lshr_b64 v[4:5], v[14:15], v17
+; GFX6-NEXT:    v_lshr_b64 v[10:11], v[14:15], v16
+; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX6-NEXT:    v_and_b32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX6-NEXT:    v_or_b32_e32 v0, v21, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v22, v1
-; GFX6-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX6-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX6-NEXT:    v_or_b32_e32 v6, v10, v6
-; GFX6-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i128:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v16
+; GFX8-NEXT:    v_and_b32_e32 v21, 0x7f, v16
 ; GFX8-NEXT:    v_not_b32_e32 v16, v16
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v24, 0x7f, v16
+; GFX8-NEXT:    v_and_b32_e32 v22, 0x7f, v16
 ; GFX8-NEXT:    v_lshlrev_b64 v[16:17], 1, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v24
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 64, v22
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, v[16:17]
-; GFX8-NEXT:    v_lshlrev_b64 v[18:19], v24, v[2:3]
-; GFX8-NEXT:    v_subrev_u32_e32 v25, vcc, 64, v24
-; GFX8-NEXT:    v_lshlrev_b64 v[21:22], v24, v[16:17]
+; GFX8-NEXT:    v_lshlrev_b64 v[18:19], v22, v[2:3]
+; GFX8-NEXT:    v_subrev_u32_e32 v23, vcc, 64, v22
 ; GFX8-NEXT:    v_or_b32_e32 v18, v0, v18
 ; GFX8-NEXT:    v_or_b32_e32 v19, v1, v19
-; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v25, v[16:17]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v23, v[16:17]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v22
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v19, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX8-NEXT:    v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 64, v23
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v23, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX8-NEXT:    v_subrev_u32_e32 v24, vcc, 64, v23
-; GFX8-NEXT:    v_or_b32_e32 v2, v0, v2
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX8-NEXT:    v_lshrrev_b64 v[16:17], v23, v[10:11]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v22
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, 64, v21
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v21, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v16, v[10:11]
+; GFX8-NEXT:    v_subrev_u32_e32 v22, vcc, 64, v21
+; GFX8-NEXT:    v_or_b32_e32 v16, v0, v16
+; GFX8-NEXT:    v_or_b32_e32 v17, v1, v17
+; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v22, v[10:11]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v21
+; GFX8-NEXT:    v_lshrrev_b64 v[18:19], v21, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v21
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v8, v8, v18
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX8-NEXT:    v_not_b32_e32 v8, v20
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GFX8-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX8-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX8-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v19
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v17
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v19, v[6:7]
-; GFX8-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX8-NEXT:    v_and_b32_e32 v18, 0x7f, v20
-; GFX8-NEXT:    v_subrev_u32_e32 v20, vcc, 64, v19
-; GFX8-NEXT:    v_lshlrev_b64 v[16:17], v19, v[8:9]
+; GFX8-NEXT:    v_lshlrev_b64 v[10:11], v17, v[6:7]
+; GFX8-NEXT:    v_subrev_u32_e32 v18, vcc, 64, v17
 ; GFX8-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX8-NEXT:    v_or_b32_e32 v11, v5, v11
-; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v20, v[8:9]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v18, v[8:9]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX8-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 64, v18
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v18, v[12:13]
-; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX8-NEXT:    v_subrev_u32_e32 v19, vcc, 64, v18
-; GFX8-NEXT:    v_or_b32_e32 v6, v4, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, v5, v7
-; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v19, v[14:15]
-; GFX8-NEXT:    v_lshrrev_b64 v[8:9], v18, v[14:15]
-; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v16
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v16, v[12:13]
+; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v8, v[14:15]
+; GFX8-NEXT:    v_subrev_u32_e32 v17, vcc, 64, v16
+; GFX8-NEXT:    v_or_b32_e32 v8, v4, v8
+; GFX8-NEXT:    v_or_b32_e32 v9, v5, v9
+; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v17, v[14:15]
+; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v16, v[14:15]
+; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX8-NEXT:    v_and_b32_e32 v8, v8, v10
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX8-NEXT:    v_or_b32_e32 v0, v21, v0
-; GFX8-NEXT:    v_or_b32_e32 v1, v22, v1
-; GFX8-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX8-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX8-NEXT:    v_or_b32_e32 v6, v10, v6
-; GFX8-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v16
+; GFX9-NEXT:    v_and_b32_e32 v21, 0x7f, v16
 ; GFX9-NEXT:    v_not_b32_e32 v16, v16
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT:    v_and_b32_e32 v24, 0x7f, v16
+; GFX9-NEXT:    v_and_b32_e32 v22, 0x7f, v16
 ; GFX9-NEXT:    v_lshlrev_b64 v[16:17], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v24
+; GFX9-NEXT:    v_sub_u32_e32 v0, 64, v22
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, v[16:17]
-; GFX9-NEXT:    v_lshlrev_b64 v[18:19], v24, v[2:3]
-; GFX9-NEXT:    v_subrev_u32_e32 v25, 64, v24
-; GFX9-NEXT:    v_lshlrev_b64 v[21:22], v24, v[16:17]
+; GFX9-NEXT:    v_lshlrev_b64 v[18:19], v22, v[2:3]
+; GFX9-NEXT:    v_subrev_u32_e32 v23, 64, v22
 ; GFX9-NEXT:    v_or_b32_e32 v18, v0, v18
 ; GFX9-NEXT:    v_or_b32_e32 v19, v1, v19
-; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v25, v[16:17]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX9-NEXT:    v_cndmask_b32_e32 v21, 0, v21, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, 0, v22, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v23, v[16:17]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v22
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v19, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v22
 ; GFX9-NEXT:    v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v23
+; GFX9-NEXT:    v_sub_u32_e32 v2, 64, v21
 ; GFX9-NEXT:    v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v23, v[8:9]
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v21, v[8:9]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX9-NEXT:    v_subrev_u32_e32 v24, 64, v23
+; GFX9-NEXT:    v_subrev_u32_e32 v22, 64, v21
 ; GFX9-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v23, v[10:11]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v22, v[10:11]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v21
+; GFX9-NEXT:    v_lshrrev_b64 v[16:17], v21, v[10:11]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v21
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v17, vcc
 ; GFX9-NEXT:    v_not_b32_e32 v8, v20
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GFX9-NEXT:    v_or_b32_e32 v3, v19, v3
-; GFX9-NEXT:    v_and_b32_e32 v19, 0x7f, v8
+; GFX9-NEXT:    v_and_or_b32 v3, 0, v17, v19
+; GFX9-NEXT:    v_and_b32_e32 v17, 0x7f, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[8:9], 1, v[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v17
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v19, v[6:7]
-; GFX9-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX9-NEXT:    v_and_b32_e32 v18, 0x7f, v20
-; GFX9-NEXT:    v_subrev_u32_e32 v20, 64, v19
-; GFX9-NEXT:    v_lshlrev_b64 v[16:17], v19, v[8:9]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v17, v[6:7]
+; GFX9-NEXT:    v_and_or_b32 v2, v2, v16, v18
+; GFX9-NEXT:    v_subrev_u32_e32 v18, 64, v17
 ; GFX9-NEXT:    v_or_b32_e32 v10, v4, v10
 ; GFX9-NEXT:    v_or_b32_e32 v11, v5, v11
-; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v20, v[8:9]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v19
-; GFX9-NEXT:    v_cndmask_b32_e32 v16, 0, v16, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v18, v[8:9]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v17
+; GFX9-NEXT:    v_and_b32_e32 v16, 0x7f, v20
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v4, v6, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v18
+; GFX9-NEXT:    v_sub_u32_e32 v6, 64, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v5, v7, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v18, v[12:13]
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v16, v[12:13]
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[14:15]
-; GFX9-NEXT:    v_subrev_u32_e32 v19, 64, v18
+; GFX9-NEXT:    v_subrev_u32_e32 v17, 64, v16
 ; GFX9-NEXT:    v_or_b32_e32 v6, v4, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v5, v7
-; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v19, v[14:15]
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v18, v[14:15]
-; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v17, v[14:15]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v16, v[14:15]
+; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, 0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX9-NEXT:    v_or_b32_e32 v0, v21, v0
-; GFX9-NEXT:    v_or_b32_e32 v1, v22, v1
-; GFX9-NEXT:    v_or_b32_e32 v4, v16, v4
-; GFX9-NEXT:    v_or_b32_e32 v5, v17, v5
-; GFX9-NEXT:    v_or_b32_e32 v6, v10, v6
-; GFX9-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX9-NEXT:    v_and_or_b32 v6, v6, v8, v10
+; GFX9-NEXT:    v_and_or_b32 v7, 0, v9, v11
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i128:
@@ -7930,90 +7756,78 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v17, v16
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT:    v_and_b32_e32 v26, 0x7f, v16
+; GFX10-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX10-NEXT:    v_and_b32_e32 v25, 0x7f, v17
+; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v17
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v27, 64, v26
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v26
-; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v25
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v24
+; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v19
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v17
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v19, 64, v25
-; GFX10-NEXT:    v_lshlrev_b64 v[23:24], v25, v[0:1]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v23, 64, v19
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
 ; GFX10-NEXT:    v_lshrrev_b64 v[17:18], v18, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v25, v[2:3]
-; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v19, v[0:1]
-; GFX10-NEXT:    v_cndmask_b32_e32 v23, 0, v23, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v24, 0, v24, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v26
-; GFX10-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX10-NEXT:    v_lshrrev_b64 v[16:17], v26, v[8:9]
-; GFX10-NEXT:    v_cndmask_b32_e32 v22, v1, v22, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b64 v[18:19], v18, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e32 v21, v0, v21, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v27, v[10:11]
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v25
-; GFX10-NEXT:    v_or_b32_e32 v16, v16, v18
-; GFX10-NEXT:    v_or_b32_e32 v17, v17, v19
+; GFX10-NEXT:    v_lshlrev_b64 v[21:22], v19, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX10-NEXT:    v_or_b32_e32 v16, v17, v21
+; GFX10-NEXT:    v_or_b32_e32 v17, v18, v22
+; GFX10-NEXT:    v_sub_nc_u32_e32 v18, 64, v24
+; GFX10-NEXT:    v_cndmask_b32_e32 v21, v0, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v22, v1, v17, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], v24, v[8:9]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v18, v[10:11]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v19, 64, v24
 ; GFX10-NEXT:    v_cndmask_b32_e32 v18, v21, v2, vcc_lo
+; GFX10-NEXT:    v_not_b32_e32 v21, v20
 ; GFX10-NEXT:    v_cndmask_b32_e32 v22, v22, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v26
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s4
-; GFX10-NEXT:    v_not_b32_e32 v16, v20
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s4
-; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v26, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v25, 0x7f, v16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
+; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v24, v[10:11]
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v19, v[10:11]
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX10-NEXT:    v_and_b32_e32 v19, 0x7f, v21
+; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 31, v5
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v9, 64, v25
-; GFX10-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX10-NEXT:    v_and_b32_e32 v23, 0x7f, v20
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v26, 0, v3, s4
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v9, v[4:5]
-; GFX10-NEXT:    v_lshlrev_b64 v[10:11], v25, v[6:7]
-; GFX10-NEXT:    v_sub_nc_u32_e32 v20, 64, v23
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 64, v25
-; GFX10-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v25, v[4:5]
-; GFX10-NEXT:    v_lshrrev_b64 v[18:19], v23, v[12:13]
-; GFX10-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v23
-; GFX10-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
-; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
-; GFX10-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX10-NEXT:    v_or_b32_e32 v5, v9, v11
-; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v8, v[14:15]
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s4, 64, v23
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v16, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v16, v18, v20
-; GFX10-NEXT:    v_or_b32_e32 v18, v19, v21
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, v3, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX10-NEXT:    v_lshrrev_b64 v[3:4], v23, v[14:15]
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v16, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v23
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v25
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v9, v18, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX10-NEXT:    v_or_b32_e32 v1, v24, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v12, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v9, v13, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s4
-; GFX10-NEXT:    v_or_b32_e32 v3, v22, v26
-; GFX10-NEXT:    v_or_b32_e32 v4, v11, v5
-; GFX10-NEXT:    v_or_b32_e32 v5, v14, v8
-; GFX10-NEXT:    v_or_b32_e32 v6, v6, v9
-; GFX10-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX10-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX10-NEXT:    v_sub_nc_u32_e32 v17, 64, v19
+; GFX10-NEXT:    v_or_b32_e32 v6, v6, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v20, 0x7f, v20
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v17, v[4:5]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v19, v[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s4
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX10-NEXT:    v_sub_nc_u32_e32 v23, 64, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
+; GFX10-NEXT:    v_or_b32_e32 v24, v10, v16
+; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX10-NEXT:    v_or_b32_e32 v25, v11, v17
+; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v20, v[12:13]
+; GFX10-NEXT:    v_lshlrev_b64 v[16:17], v23, v[14:15]
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s4
+; GFX10-NEXT:    v_and_or_b32 v2, v21, v2, v18
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v4, v24, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v18, v5, v25, vcc_lo
+; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[14:15]
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v19
+; GFX10-NEXT:    v_or_b32_e32 v10, v10, v16
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v20
+; GFX10-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 0, v20
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s4
+; GFX10-NEXT:    v_lshrrev_b64 v[8:9], v20, v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v18, v7, s4
+; GFX10-NEXT:    v_and_or_b32 v3, 0, v3, v22
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s5
+; GFX10-NEXT:    v_and_or_b32 v6, v10, v8, v6
+; GFX10-NEXT:    v_and_or_b32 v7, 0, v9, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i128:
@@ -8021,100 +7835,80 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v17, v16
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX11-NEXT:    v_and_b32_e32 v24, 0x7f, v16
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v25, 0x7f, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v17
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 31, v1
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v17
-; GFX11-NEXT:    v_lshlrev_b64 v[23:24], v25, v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v23, 0, v23 :: v_dual_and_b32 v26, 0x7f, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, 0, v24, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v25
-; GFX11-NEXT:    v_lshlrev_b64 v[21:22], v25, v[2:3]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v19, 64, v25
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v27, 64, v26
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v26
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v23, 64, v19
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
 ; GFX11-NEXT:    v_lshrrev_b64 v[17:18], v18, v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v19, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v22, v18, v22
-; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v26
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v21, v17, v21
-; GFX11-NEXT:    v_lshrrev_b64 v[16:17], v26, v[8:9]
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v1, v22, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[18:19], v18, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v0, v21, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v27, v[10:11]
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v19
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v16, s0
-; GFX11-NEXT:    v_not_b32_e32 v16, v20
+; GFX11-NEXT:    v_lshlrev_b64 v[21:22], v19, v[2:3]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v23, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v16, v17, v21
+; GFX11-NEXT:    v_or_b32_e32 v17, v18, v22
+; GFX11-NEXT:    v_sub_nc_u32_e32 v18, 64, v24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v21, v0, v16 :: v_dual_cndmask_b32 v22, v1, v17
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], v24, v[8:9]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v18, v[10:11]
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v19, 64, v24
 ; GFX11-NEXT:    v_cndmask_b32_e32 v18, v21, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v17, s0
+; GFX11-NEXT:    v_not_b32_e32 v21, v20
 ; GFX11-NEXT:    v_cndmask_b32_e32 v22, v22, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT:    v_and_b32_e32 v25, 0x7f, v16
-; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v26, v[10:11]
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v9 :: v_dual_cndmask_b32 v0, v0, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
+; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v24, v[10:11]
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v19, v[10:11]
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-NEXT:    v_and_b32_e32 v19, 0x7f, v21
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 31, v5
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v24
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
-; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 64, v25
-; GFX11-NEXT:    v_cndmask_b32_e64 v26, 0, v3, s0
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 64, v25
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX11-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v9, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v25, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX11-NEXT:    v_lshlrev_b64 v[10:11], v25, v[6:7]
-; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v25
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v25
-; GFX11-NEXT:    v_or_b32_e32 v1, v24, v1
-; GFX11-NEXT:    v_or_b32_e32 v10, v8, v10
-; GFX11-NEXT:    v_and_b32_e32 v23, 0x7f, v20
-; GFX11-NEXT:    v_or_b32_e32 v2, v18, v2
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v11, 0, v16 :: v_dual_cndmask_b32 v10, v3, v10
-; GFX11-NEXT:    v_sub_nc_u32_e32 v20, 64, v23
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v23
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], v23, v[12:13]
-; GFX11-NEXT:    v_cmp_gt_u32_e64 s0, 64, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
-; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v8, v[14:15]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], v23, v[14:15]
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v23
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s2
-; GFX11-NEXT:    v_or_b32_e32 v16, v18, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v21
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v5, v7, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, v4, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, v16, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, v18, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v10
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v8, v12, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v9, v13, s1
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, v3, s0
-; GFX11-NEXT:    v_or_b32_e32 v3, v22, v26
-; GFX11-NEXT:    v_or_b32_e32 v4, v11, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v5, v14, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-NEXT:    v_sub_nc_u32_e32 v17, 64, v19
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v16
+; GFX11-NEXT:    v_and_b32_e32 v20, 0x7f, v20
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v10, v0 :: v_dual_cndmask_b32 v1, v11, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v24
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v17, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v19, v[6:7]
+; GFX11-NEXT:    v_sub_nc_u32_e32 v23, 64, v20
+; GFX11-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s0
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v19
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v19
+; GFX11-NEXT:    v_or_b32_e32 v24, v10, v16
+; GFX11-NEXT:    v_or_b32_e32 v25, v11, v17
+; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v20, v[12:13]
+; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v8, v[4:5]
+; GFX11-NEXT:    v_lshlrev_b64 v[16:17], v23, v[14:15]
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v20
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
+; GFX11-NEXT:    v_and_or_b32 v2, v21, v2, v18
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v19
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v4, v24 :: v_dual_cndmask_b32 v18, v5, v25
+; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v8, v[14:15]
+; GFX11-NEXT:    v_or_b32_e32 v10, v10, v16
+; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v20
+; GFX11-NEXT:    v_or_b32_e32 v11, v11, v17
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s0
+; GFX11-NEXT:    v_lshrrev_b64 v[8:9], v20, v[14:15]
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v18, v7, s0
+; GFX11-NEXT:    v_and_or_b32 v3, 0, v3, v22
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s1
+; GFX11-NEXT:    v_and_or_b32 v6, v10, v8, v6
+; GFX11-NEXT:    v_and_or_b32 v7, 0, v9, v7
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index b50af195e40ef2..15038b2da9e734 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -674,10 +674,12 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
 ; SI-NEXT:  .LBB6_3: ; %.continue0
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
-; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; SI-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    v_and_b32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
-; SI-NEXT:    s_nop 1
+; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -722,10 +724,12 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:  .LBB6_3: ; %.continue0
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, 1.0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -770,8 +774,10 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
 ; GFX10-32-NEXT:  .LBB6_3: ; %.continue0
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_xor_b32 s2, s0, -1
 ; GFX10-32-NEXT:    s_mov_b32 s1, s0
-; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-32-NEXT:    v_and_b32_e32 v0, 1.0, v0
 ; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -816,8 +822,10 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
 ; GFX10-64-NEXT:  .LBB6_3: ; %.continue0
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
-; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX10-64-NEXT:    v_and_b32_e32 v0, 1.0, v0
 ; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -911,10 +919,12 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    s_cbranch_execz .LBB7_8
 ; SI-NEXT:  .LBB7_5: ; %.continue0
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; SI-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[6:7]
+; SI-NEXT:    v_and_b32_e32 v2, v2, v0
 ; SI-NEXT:    v_mov_b32_e32 v3, v2
-; SI-NEXT:    s_nop 1
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -977,10 +987,12 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX9-NEXT:  .LBB7_5: ; %.continue0
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[6:7]
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1042,8 +1054,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-32-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX10-32-NEXT:  .LBB7_5: ; %.continue0
 ; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-32-NEXT:    s_xor_b32 s3, s0, -1
 ; GFX10-32-NEXT:    s_mov_b32 s2, s0
-; GFX10-32-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s2
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s3
+; GFX10-32-NEXT:    v_and_b32_e32 v2, v2, v0
 ; GFX10-32-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-32-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-32-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1106,8 +1120,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    s_cbranch_execz .LBB7_8
 ; GFX10-64-NEXT:  .LBB7_5: ; %.continue0
 ; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-64-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
 ; GFX10-64-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX10-64-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[6:7]
+; GFX10-64-NEXT:    v_and_b32_e32 v2, v2, v0
 ; GFX10-64-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-64-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-64-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 980ba3da4bac7a..508f226e2872e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1583,14 +1583,15 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_lshr_b64 v[10:11], v[4:5], v3
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v2
 ; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX6-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GFX6-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v10
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i65:
@@ -1605,14 +1606,15 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v3, v[4:5]
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v2, v[4:5]
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX8-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GFX8-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v10
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i65:
@@ -1627,14 +1629,15 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_lshrrev_b64 v[10:11], v3, v[4:5]
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v2, v[4:5]
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v8
-; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v10
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_lshr_i65:
@@ -1650,13 +1653,14 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v2, v[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_or_b32_e32 v2, v6, v8
 ; GFX10-NEXT:    v_or_b32_e32 v6, v7, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v2, v3, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_lshr_i65:
@@ -1671,13 +1675,14 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v2, v[4:5]
 ; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_or_b32_e32 v2, v6, v8
 ; GFX11-NEXT:    v_or_b32_e32 v6, v7, v9
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v6, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
+; GFX11-NEXT:    v_and_b32_e32 v2, v3, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i65 %value, %amount
   ret i65 %result
@@ -1750,45 +1755,45 @@ define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-LABEL: s_lshr_i65:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_and_b64 s[4:5], s[2:3], 1
-; GCN-NEXT:    s_sub_i32 s10, s3, 64
-; GCN-NEXT:    s_sub_i32 s8, 64, s3
+; GCN-NEXT:    s_sub_i32 s12, s3, 64
+; GCN-NEXT:    s_sub_i32 s10, 64, s3
 ; GCN-NEXT:    s_cmp_lt_u32 s3, 64
-; GCN-NEXT:    s_cselect_b32 s11, 1, 0
+; GCN-NEXT:    s_cselect_b32 s2, 1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-NEXT:    s_cselect_b32 s12, 1, 0
+; GCN-NEXT:    s_cselect_b32 s13, 1, 0
+; GCN-NEXT:    s_lshr_b64 s[8:9], s[0:1], s3
+; GCN-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[4:5], s3
-; GCN-NEXT:    s_lshr_b64 s[2:3], s[0:1], s3
-; GCN-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
-; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[4:5], s10
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT:    s_cmp_lg_u32 s12, 0
-; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[2:3], s[6:7], 0
+; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GCN-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
+; GCN-NEXT:    s_cmp_lg_u32 s2, 0
+; GCN-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GCN-NEXT:    s_cmp_lg_u32 s13, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GCN-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], s[6:7]
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i65:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b64 s[4:5], s[2:3], 1
-; GFX10PLUS-NEXT:    s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT:    s_sub_i32 s2, 64, s3
+; GFX10PLUS-NEXT:    s_sub_i32 s12, s3, 64
+; GFX10PLUS-NEXT:    s_sub_i32 s8, 64, s3
 ; GFX10PLUS-NEXT:    s_cmp_lt_u32 s3, 64
-; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10PLUS-NEXT:    s_cmp_eq_u32 s3, 0
-; GFX10PLUS-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s13, 1, 0
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[6:7], s[0:1], s3
-; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s2
-; GFX10PLUS-NEXT:    s_lshr_b64 s[2:3], s[4:5], s3
+; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
+; GFX10PLUS-NEXT:    s_lshr_b64 s[10:11], s[4:5], s3
 ; GFX10PLUS-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GFX10PLUS-NEXT:    s_lshr_b64 s[4:5], s[4:5], s10
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10PLUS-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s13, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX10PLUS-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[2:3], s[10:11]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i65 %value, %amount
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index d36f5c0ea89d98..bb06252902fd34 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -5126,9 +5126,10 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[6:7], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5168,14 +5169,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT:    s_and_b32 s0, 1, s2
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5215,14 +5215,13 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_and_b32 s0, 1, s2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    s_and_b32 s0, 1, s2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5259,16 +5258,15 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[6:7], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s10
-; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s1, 1, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
+; GFX10-NEXT:    v_and_b32_e32 v1, s1, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX10-NEXT:    s_add_u32 s1, s0, 0x80000000
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
@@ -5300,16 +5298,14 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[6:7], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s10
-; GFX11-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX11-NEXT:    s_cmp_lg_u64 s[6:7], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s1, 1, s1
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v2, s5
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    s_ashr_i32 s0, s9, 31
+; GFX11-NEXT:    s_and_b32 s1, s1, 1
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, s1, v2
+; GFX11-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX11-NEXT:    s_add_u32 s1, s0, 0x80000000
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
@@ -5346,9 +5342,10 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX6-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v7
 ; GFX6-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GFX6-NEXT:    v_bfrev_b32_e32 v6, 1
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v3, v6
@@ -5377,9 +5374,10 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v7
 ; GFX8-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GFX8-NEXT:    v_bfrev_b32_e32 v6, 1
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v3, v6
@@ -5408,9 +5406,10 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v7
 ; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_bfrev_b32_e32 v6, 1
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v3, v6
@@ -5434,14 +5433,15 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX10-NEXT:    v_add_co_u32 v6, s0, 0x80000000, v3
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_add_co_u32 v6, s0, 0x80000000, v3
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -5461,14 +5461,14 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    v_dual_cndmask_b32 v3, v7, v6 :: v_dual_and_b32 v2, v2, v8
+; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
-; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-NEXT:    v_add_co_u32 v6, null, 0x80000000, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_add_co_u32 v6, null, 0x80000000, v3
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5495,12 +5495,13 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[2:3], 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5521,7 +5522,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v3, v7, vcc
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -5530,9 +5531,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT:    s_and_b32 s0, 1, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX8-NEXT:    s_and_b32 s0, s4, 1
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
@@ -5555,7 +5555,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v6, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
@@ -5564,9 +5564,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    s_and_b32 s0, 1, s4
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX9-NEXT:    s_and_b32 s0, s4, 1
+; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
@@ -5586,19 +5585,18 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    s_and_b32 s0, s0, 1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
+; GFX10-NEXT:    v_and_b32_e32 v1, s0, v8
 ; GFX10-NEXT:    v_add_co_u32 v3, s0, 0x80000000, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5616,20 +5614,19 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], 0
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
+; GFX11-NEXT:    s_cmp_lg_u64 s[2:3], 0
 ; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX11-NEXT:    s_and_b32 s0, 1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; GFX11-NEXT:    v_add_co_u32 v3, null, 0x80000000, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
+; GFX11-NEXT:    v_and_b32_e32 v1, s0, v8
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5655,13 +5652,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
-; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5679,13 +5677,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX6-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
@@ -5707,13 +5706,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v2, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5731,13 +5731,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX8-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x80000000, v6
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
@@ -5759,13 +5760,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v2, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5783,13 +5785,14 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
-; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v5, v6, v5
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 0x80000000, v6
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
@@ -5810,44 +5813,46 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v4, v12
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v4, v12
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
-; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[4:5]
+; GFX10-NEXT:    v_and_b32_e32 v1, v19, v18
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[12:13], v[6:7]
+; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
+; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7]
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
+; GFX10-NEXT:    v_and_b32_e32 v2, v5, v4
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v17
-; GFX10-NEXT:    v_add_co_u32 v7, s5, 0x80000000, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, s4, 0x80000000, v3
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_add_co_u32 v7, s5, 0x80000000, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX10-NEXT:    v_add_co_u32 v4, s4, 0x80000000, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v16, v3, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v17, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v19, v7, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v10, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_v2i128:
@@ -5861,43 +5866,42 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, v4, v12
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[10:11]
+; GFX11-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v4, v12
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, v19, v18
+; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[4:5]
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
+; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[12:13], v[6:7]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
+; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[14:15]
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7]
+; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v13
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v2 :: v_dual_and_b32 v2, v5, v4
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v17
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_add_co_u32 v7, null, 0x80000000, v6
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_add_co_u32 v4, null, 0x80000000, v3
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1
+; GFX11-NEXT:    v_add_co_u32 v4, null, 0x80000000, v3
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v3 :: v_dual_and_b32 v5, 1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v17, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v19, v7, s0
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v3 :: v_dual_cndmask_b32 v3, v17, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v10, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v11, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
@@ -5921,9 +5925,10 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[10:11], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5954,9 +5959,10 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[14:15], 0
+; GFX6-NEXT:    v_cmp_ne_u64_e64 s[4:5], s[14:15], 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6000,14 +6006,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_and_b32 s0, 1, s2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
 ; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT:    s_and_b32 s0, 1, s2
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX8-NEXT:    s_and_b32 s0, s2, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6039,14 +6044,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_and_b32 s4, 1, s6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX8-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
 ; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX8-NEXT:    s_and_b32 s4, 1, s6
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX8-NEXT:    s_and_b32 s4, s6, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6090,14 +6094,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_and_b32 s0, 1, s2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT:    s_and_b32 s0, 1, s2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v1, s0, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s0, s17, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6129,14 +6132,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_and_b32 s4, 1, s6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
 ; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX9-NEXT:    s_and_b32 s4, 1, s6
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
+; GFX9-NEXT:    s_and_b32 s4, s6, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -6177,17 +6179,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    s_and_b32 s0, 1, s18
-; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT:    s_and_b32 s1, 1, s1
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    s_ashr_i32 s10, s17, 31
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
 ; GFX10-NEXT:    s_add_u32 s11, s10, 0x80000000
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX10-NEXT:    s_add_u32 s0, s4, s12
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v1, s1, v2
 ; GFX10-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX10-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
@@ -6204,27 +6205,26 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX10-NEXT:    s_and_b32 s4, 1, s12
-; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX10-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX10-NEXT:    s_and_b32 s5, 1, s5
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
+; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v2, s5, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s16
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, 0, s4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s8
-; GFX10-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s10, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s10, vcc_lo
-; GFX10-NEXT:    s_add_u32 s0, s4, 0x80000000
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s10, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v4
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
@@ -6254,17 +6254,15 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s2, s[10:11], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s18
-; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX11-NEXT:    s_cmp_lg_u64 s[10:11], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    s_and_b32 s1, 1, s1
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX11-NEXT:    s_ashr_i32 s10, s17, 31
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX11-NEXT:    s_and_b32 s1, s1, 1
 ; GFX11-NEXT:    s_add_u32 s11, s10, 0x80000000
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
 ; GFX11-NEXT:    s_add_u32 s0, s4, s12
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, s1, v2
 ; GFX11-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX11-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
@@ -6274,23 +6272,23 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
-; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s6, s[14:15], 0
-; GFX11-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_and_b32 v0, 1, v0
+; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX11-NEXT:    s_and_b32 s4, 1, s12
-; GFX11-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX11-NEXT:    s_cmp_lg_u64 s[14:15], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX11-NEXT:    s_and_b32 s5, 1, s5
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v3, 0, s4
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_and_b32 s5, s5, 1
+; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v2, v1 :: v_dual_and_b32 v2, s5, v3
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s16
-; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s17
@@ -6299,7 +6297,6 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
-; GFX11-NEXT:    s_add_u32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
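
The saddsat diffs above all show the same rewrite: a select between a boolean and a constant zero becomes plain logic, so the second v_cndmask against 0 turns into a compare materialized as a 0/1 value plus a v_and. A minimal generic-MIR sketch of the boolean case — value names are illustrative, not taken from the patch:

    ; before: %sel = %c ? false : %b          (all s1 booleans)
    %f:_(s1)   = G_CONSTANT i1 false
    %sel:_(s1) = G_SELECT %c:_(s1), %f, %b
    ; after: the identity  c ? 0 : b  ==  !c & b
    %t:_(s1)   = G_CONSTANT i1 true
    %nc:_(s1)  = G_XOR %c, %t
    %sel:_(s1) = G_AND %nc, %b

In the regenerated checks this surfaces as the negation being folded into the compare (v_cmp_eq_u64 flipped to v_cmp_ne_u64), followed by v_cndmask 0/1 and v_and.
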
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index c2f911cc445879..d0da2f485a1133 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1585,14 +1585,15 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], v[2:3], v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 64, v3
 ; GFX6-NEXT:    v_lshl_b64 v[6:7], v[0:1], v3
-; GFX6-NEXT:    v_or_b32_e32 v9, v4, v5
+; GFX6-NEXT:    v_or_b32_e32 v7, v4, v5
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[0:1], v8
 ; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, v0, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i65:
@@ -1603,14 +1604,15 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v3, v[2:3]
 ; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 64, v3
 ; GFX8-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX8-NEXT:    v_or_b32_e32 v9, v4, v5
+; GFX8-NEXT:    v_or_b32_e32 v7, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b64 v[4:5], v8, v[0:1]
 ; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    v_and_b32_e32 v0, v0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i65:
@@ -1621,14 +1623,15 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], v3, v[2:3]
 ; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v3
 ; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX9-NEXT:    v_or_b32_e32 v9, v4, v5
+; GFX9-NEXT:    v_or_b32_e32 v7, v4, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v8, v[0:1]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, v0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i65:
@@ -1636,15 +1639,16 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 64, v3
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, 64, v3
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX10-NEXT:    v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX10-NEXT:    v_or_b32_e32 v1, v5, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v8, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], v7, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1654,14 +1658,15 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 64, v3
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v3, v[2:3]
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v3
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 64, v3
 ; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
 ; GFX11-NEXT:    v_lshrrev_b64 v[5:6], v6, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v3, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v8, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v1, v5, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v8, v1 :: v_dual_cndmask_b32 v1, 0, v7
+; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v7, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -1720,41 +1725,43 @@ define i65 @v_shl_i65_33(i65 %value) {
 define amdgpu_ps i65 @s_shl_i65(i65 inreg %value, i65 inreg %amount) {
 ; GCN-LABEL: s_shl_i65:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_sub_i32 s10, s3, 64
-; GCN-NEXT:    s_sub_i32 s6, 64, s3
+; GCN-NEXT:    s_sub_i32 s5, s3, 64
+; GCN-NEXT:    s_sub_i32 s8, 64, s3
 ; GCN-NEXT:    s_cmp_lt_u32 s3, 64
-; GCN-NEXT:    s_cselect_b32 s11, 1, 0
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s3, 0
 ; GCN-NEXT:    s_cselect_b32 s12, 1, 0
-; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
-; GCN-NEXT:    s_lshl_b64 s[8:9], s[2:3], s3
-; GCN-NEXT:    s_lshl_b64 s[4:5], s[0:1], s3
-; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GCN-NEXT:    s_lshl_b64 s[8:9], s[0:1], s10
-; GCN-NEXT:    s_cmp_lg_u32 s11, 0
-; GCN-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
-; GCN-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[8:9]
+; GCN-NEXT:    s_lshr_b64 s[8:9], s[0:1], s8
+; GCN-NEXT:    s_lshl_b64 s[10:11], s[2:3], s3
+; GCN-NEXT:    s_lshl_b64 s[6:7], s[0:1], s3
+; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
+; GCN-NEXT:    s_lshl_b64 s[10:11], s[0:1], s5
+; GCN-NEXT:    s_bfe_u64 s[0:1], s[4:5], 0x10000
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_cselect_b64 s[4:5], s[8:9], s[10:11]
 ; GCN-NEXT:    s_cmp_lg_u32 s12, 0
 ; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i65:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_sub_i32 s10, s3, 64
-; GFX10PLUS-NEXT:    s_sub_i32 s4, 64, s3
+; GFX10PLUS-NEXT:    s_sub_i32 s5, s3, 64
+; GFX10PLUS-NEXT:    s_sub_i32 s6, 64, s3
 ; GFX10PLUS-NEXT:    s_cmp_lt_u32 s3, 64
-; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX10PLUS-NEXT:    s_cmp_eq_u32 s3, 0
-; GFX10PLUS-NEXT:    s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT:    s_lshr_b64 s[4:5], s[0:1], s4
-; GFX10PLUS-NEXT:    s_lshl_b64 s[6:7], s[2:3], s3
-; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[0:1], s3
-; GFX10PLUS-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX10PLUS-NEXT:    s_lshl_b64 s[6:7], s[0:1], s10
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[8:9], 0
-; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
-; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX10PLUS-NEXT:    s_lshr_b64 s[6:7], s[0:1], s6
+; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[2:3], s3
+; GFX10PLUS-NEXT:    s_lshl_b64 s[10:11], s[0:1], s3
+; GFX10PLUS-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10PLUS-NEXT:    s_bfe_u64 s[8:9], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_lshl_b64 s[12:13], s[0:1], s5
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[8:9], s[10:11]
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[12:13]
+; GFX10PLUS-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i65 %value, %amount
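
In s_shl_i65 the scalar select of the shifted low half against zero is likewise replaced by mask arithmetic: s_bfe_u64 with source operand 0x10000 (width 1 encoded in bits [22:16], offset 0 in bits [5:0]) zero-extends the 1-bit condition to 64 bits, and s_and_b64 applies it. A sketch of what the new sequence computes, with hypothetical names:

    ; cond   = s_cselect_b32 cond, 1, 0     ; shift amount < 64
    ; mask   = s_bfe_u64(cond, 0x10000)     ; extract bit 0, zero-extended to 64 bits
    ; result = mask & shl_lo                ; s_and_b64 replaces the old s_cselect_b64 x, 0
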
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index a60370cd460f9e..ea3d0f9a841eff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2473,14 +2473,15 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i48:
@@ -2490,8 +2491,10 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2502,22 +2505,39 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10PLUS-LABEL: v_usubsat_i48:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i48:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_usubsat_i48:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v2, v0
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
   ret i48 %result
 }
@@ -2532,14 +2552,14 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX6-NEXT:    s_subb_u32 s2, s1, s3
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_cmp_lg_u32 s2, s1
+; GFX6-NEXT:    s_cmp_eq_u32 s2, s1
 ; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s3
-; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_i48:
@@ -2548,7 +2568,10 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s3
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_xor_b32 s2, s2, 1
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2558,7 +2581,10 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s3
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_xor_b32 s2, s2, 1
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -2568,7 +2594,10 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
 ; GFX10PLUS-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s3
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10PLUS-NEXT:    s_lshr_b64 s[0:1], s[0:1], 16
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
@@ -2583,14 +2612,15 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i48_sv:
@@ -2600,8 +2630,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2612,21 +2644,37 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i48_sv:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i48_sv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i48_sv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v2, v0
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
   %ext.result = zext i48 %result to i64
   %cast = bitcast i64 %ext.result to <2 x float>
@@ -2641,14 +2689,15 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i48_vs:
@@ -2658,8 +2707,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2670,21 +2721,37 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i48_vs:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i48_vs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i48_vs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v2, v0
+; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
   %ext.result = zext i48 %result to i64
   %cast = bitcast i64 %ext.result to <2 x float>
@@ -2697,8 +2764,10 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i64:
@@ -2706,8 +2775,10 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i64:
@@ -2715,18 +2786,32 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10PLUS-LABEL: v_usubsat_i64:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_usubsat_i64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_usubsat_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
 }
@@ -2736,28 +2821,40 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s3
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX6-NEXT:    s_xor_b32 s2, s2, 1
+; GFX6-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s3
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_xor_b32 s2, s2, 1
+; GFX8-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s3
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_xor_b32 s2, s2, 1
+; GFX9-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s2
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s3
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s2, s2, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
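
The scalar usubsat lowerings above use the inverted form of the same pattern: saturation selects 0 when the subtract borrows, so the condition is negated first. The shape of the new sequence as it appears in these checks, names again illustrative:

    ; borrow = s_cselect_b32 borrow, 1, 0   ; carry-out of s_sub_u32/s_subb_u32
    ; inv    = borrow ^ 1                   ; s_xor_b32: keep the result on !borrow
    ; mask   = s_bfe_u64(inv, 0x10000)      ; 1-bit zero-extend to 64 bits
    ; result = mask & diff                  ; s_and_b64

The vector paths do the same with s_xor_b64/s_xor_b32 against -1 to invert the whole lane mask before the v_cndmask/v_and pair.
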
@@ -2769,8 +2866,10 @@ define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i64_sv:
@@ -2778,8 +2877,10 @@ define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: usubsat_i64_sv:
@@ -2787,17 +2888,30 @@ define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i64_sv:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i64_sv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i64_sv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   %cast = bitcast i64 %result to <2 x float>
   ret <2 x float> %cast
@@ -2809,8 +2923,10 @@ define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i64_vs:
@@ -2818,8 +2934,10 @@ define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: usubsat_i64_vs:
@@ -2827,17 +2945,30 @@ define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i64_vs:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i64_vs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i64_vs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   %cast = bitcast i64 %result to <2 x float>
   ret <2 x float> %cast
@@ -2849,12 +2980,16 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v2, v6
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v7, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i64:
@@ -2862,12 +2997,16 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v2, v6
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v3, v7, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i64:
@@ -2875,38 +3014,49 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v2, v6
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v7, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX10-NEXT:    v_sub_co_u32 v2, s4, v2, v6
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v3, s4, v3, v7, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s4
+; GFX10-NEXT:    v_sub_co_u32 v1, s4, v2, v6
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v2, s4, v3, v7, s4
+; GFX10-NEXT:    s_xor_b32 s5, vcc_lo, -1
+; GFX10-NEXT:    s_xor_b32 s4, s4, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v2i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
-; GFX11-NEXT:    v_sub_co_u32 v2, s0, v2, v6
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, s0, v3, v7, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s0
+; GFX11-NEXT:    v_sub_co_u32 v1, s0, v2, v6
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v2, s0, v3, v7, s0
+; GFX11-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; GFX11-NEXT:    s_xor_b32 s0, s0, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -2917,40 +3067,64 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s5
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_xor_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
 ; GFX6-NEXT:    s_sub_u32 s2, s2, s6
 ; GFX6-NEXT:    s_subb_u32 s3, s3, s7
-; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_xor_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX6-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s5
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_xor_b32 s4, s4, 1
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
 ; GFX8-NEXT:    s_sub_u32 s2, s2, s6
 ; GFX8-NEXT:    s_subb_u32 s3, s3, s7
-; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_xor_b32 s4, s4, 1
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s5
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_xor_b32 s4, s4, 1
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
 ; GFX9-NEXT:    s_sub_u32 s2, s2, s6
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s7
-; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_xor_b32 s4, s4, 1
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_v2i64:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sub_u32 s0, s0, s4
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s5
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
 ; GFX10PLUS-NEXT:    s_sub_u32 s2, s2, s6
 ; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s7
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -2963,8 +3137,12 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX6-NEXT:    s_subb_u32 s2, s2, s6
 ; GFX6-NEXT:    s_subb_u32 s3, s3, s7
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX6-NEXT:    s_xor_b32 s4, s4, 1
+; GFX6-NEXT:    s_bfe_u64 s[6:7], s[4:5], 0x10000
+; GFX6-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_i128:
@@ -2973,8 +3151,12 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX8-NEXT:    s_subb_u32 s2, s2, s6
 ; GFX8-NEXT:    s_subb_u32 s3, s3, s7
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_xor_b32 s4, s4, 1
+; GFX8-NEXT:    s_bfe_u64 s[6:7], s[4:5], 0x10000
+; GFX8-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX8-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_i128:
@@ -2983,8 +3165,12 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX9-NEXT:    s_subb_u32 s2, s2, s6
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s7
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_xor_b32 s4, s4, 1
+; GFX9-NEXT:    s_bfe_u64 s[6:7], s[4:5], 0x10000
+; GFX9-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_i128:
@@ -2993,8 +3179,12 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s5
 ; GFX10PLUS-NEXT:    s_subb_u32 s2, s2, s6
 ; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s7
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX10PLUS-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s4, s4, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[6:7], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_bfe_u64 s[4:5], s[4:5], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
   ret i128 %result
@@ -3006,14 +3196,16 @@ define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, s2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s3
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_mov_b32_e32 v4, s3
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v1, v2, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v3, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i128_sv:
@@ -3021,14 +3213,16 @@ define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v1, v2, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v4, v3, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: usubsat_i128_sv:
@@ -3036,27 +3230,43 @@ define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v4, v2, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v1, v2, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v3, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i128_sv:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
-; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i128_sv:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v3, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i128_sv:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v3, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v3, v0
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v2, v3, v2
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
   ret <4 x float> %cast
@@ -3068,14 +3278,16 @@ define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v4, s2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s3
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v4, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_mov_b32_e32 v4, s3
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v1, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v3, v4, vcc
+; GFX6-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: usubsat_i128_vs:
@@ -3083,14 +3295,16 @@ define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v4, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v1, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v4, vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: usubsat_i128_vs:
@@ -3098,27 +3312,43 @@ define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, s3
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v4, s3
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v1, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v4, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10PLUS-LABEL: usubsat_i128_vs:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
-; GFX10PLUS-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
-; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
-; GFX10PLUS-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: usubsat_i128_vs:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v3, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, v3, v2
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: usubsat_i128_vs:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v3, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v3, v0
+; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v2, v3, v2
+; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
   ret <4 x float> %cast
@@ -3130,20 +3360,24 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
-; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v5, v13, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v6, vcc, v6, v14, vcc
-; GFX6-NEXT:    v_subb_u32_e32 v7, vcc, v7, v15, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v10, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v11, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v4, v12
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v5, v13, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v6, v14, vcc
+; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v7, v15, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX6-NEXT:    v_and_b32_e32 v4, v5, v1
+; GFX6-NEXT:    v_and_b32_e32 v6, v5, v3
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i128:
@@ -3151,20 +3385,24 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v11, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v12
-; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v5, v13, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v14, vcc
-; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v15, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v10, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v3, v11, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v4, v12
+; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v5, v13, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v6, v14, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v4, vcc, v7, v15, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX8-NEXT:    v_and_b32_e32 v4, v5, v1
+; GFX8-NEXT:    v_and_b32_e32 v6, v5, v3
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i128:
@@ -3172,62 +3410,72 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v9, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v12
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v13, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v14, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v15, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v10, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v11, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v4, v12
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v13, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v6, v14, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v15, vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX9-NEXT:    v_and_b32_e32 v4, v5, v1
+; GFX9-NEXT:    v_and_b32_e32 v6, v5, v3
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v2i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v8
-; GFX10-NEXT:    v_sub_co_u32 v4, s4, v4, v12
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s4, v5, v13, s4
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v6, s4, v6, v14, s4
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v7, s4, v7, v15, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s4
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v10, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v3, v11, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v4, v12
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v4, vcc_lo, v5, v13, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v14, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v4, vcc_lo, v7, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_and_b32_e32 v4, v6, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, v6, v5
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v2i128:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v8
-; GFX11-NEXT:    v_sub_co_u32 v4, s0, v4, v12
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v5, s0, v5, v13, s0
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v6, s0, v6, v14, s0
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v7, s0, v7, v15, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s0
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v10, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v2, vcc_lo, v3, v11, vcc_lo
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_sub_co_u32 v3, vcc_lo, v4, v12
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v4, vcc_lo, v5, v13, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v6, v14, vcc_lo
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v4, vcc_lo, v7, v15, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    v_mov_b32_e32 v7, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX11-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, v2, v1
+; GFX11-NEXT:    v_and_b32_e32 v4, v6, v3
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v6, v6, v5
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
@@ -3240,14 +3488,22 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX6-NEXT:    s_subb_u32 s1, s1, s9
 ; GFX6-NEXT:    s_subb_u32 s2, s2, s10
 ; GFX6-NEXT:    s_subb_u32 s3, s3, s11
-; GFX6-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX6-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_xor_b32 s8, s8, 1
+; GFX6-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX6-NEXT:    s_and_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
 ; GFX6-NEXT:    s_sub_u32 s4, s4, s12
 ; GFX6-NEXT:    s_subb_u32 s5, s5, s13
 ; GFX6-NEXT:    s_subb_u32 s6, s6, s14
 ; GFX6-NEXT:    s_subb_u32 s7, s7, s15
-; GFX6-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
-; GFX6-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
+; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX6-NEXT:    s_xor_b32 s8, s8, 1
+; GFX6-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX6-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX6-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
+; GFX6-NEXT:    s_and_b64 s[6:7], s[8:9], s[6:7]
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_usubsat_v2i128:
@@ -3256,14 +3512,22 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX8-NEXT:    s_subb_u32 s1, s1, s9
 ; GFX8-NEXT:    s_subb_u32 s2, s2, s10
 ; GFX8-NEXT:    s_subb_u32 s3, s3, s11
-; GFX8-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX8-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_xor_b32 s8, s8, 1
+; GFX8-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX8-NEXT:    s_and_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
 ; GFX8-NEXT:    s_sub_u32 s4, s4, s12
 ; GFX8-NEXT:    s_subb_u32 s5, s5, s13
 ; GFX8-NEXT:    s_subb_u32 s6, s6, s14
 ; GFX8-NEXT:    s_subb_u32 s7, s7, s15
-; GFX8-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
-; GFX8-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
+; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX8-NEXT:    s_xor_b32 s8, s8, 1
+; GFX8-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX8-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX8-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[6:7], s[8:9], s[6:7]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_usubsat_v2i128:
@@ -3272,14 +3536,22 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX9-NEXT:    s_subb_u32 s1, s1, s9
 ; GFX9-NEXT:    s_subb_u32 s2, s2, s10
 ; GFX9-NEXT:    s_subb_u32 s3, s3, s11
-; GFX9-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX9-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_xor_b32 s8, s8, 1
+; GFX9-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
 ; GFX9-NEXT:    s_sub_u32 s4, s4, s12
 ; GFX9-NEXT:    s_subb_u32 s5, s5, s13
 ; GFX9-NEXT:    s_subb_u32 s6, s6, s14
 ; GFX9-NEXT:    s_subb_u32 s7, s7, s15
-; GFX9-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
-; GFX9-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_xor_b32 s8, s8, 1
+; GFX9-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX9-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX9-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
+; GFX9-NEXT:    s_and_b64 s[6:7], s[8:9], s[6:7]
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_usubsat_v2i128:
@@ -3288,14 +3560,22 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX10PLUS-NEXT:    s_subb_u32 s1, s1, s9
 ; GFX10PLUS-NEXT:    s_subb_u32 s2, s2, s10
 ; GFX10PLUS-NEXT:    s_subb_u32 s3, s3, s11
-; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], 0, s[0:1]
-; GFX10PLUS-NEXT:    s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX10PLUS-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[0:1], s[10:11], s[0:1]
+; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[8:9], s[2:3]
 ; GFX10PLUS-NEXT:    s_sub_u32 s4, s4, s12
 ; GFX10PLUS-NEXT:    s_subb_u32 s5, s5, s13
 ; GFX10PLUS-NEXT:    s_subb_u32 s6, s6, s14
 ; GFX10PLUS-NEXT:    s_subb_u32 s7, s7, s15
-; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], 0, s[4:5]
-; GFX10PLUS-NEXT:    s_cselect_b64 s[6:7], 0, s[6:7]
+; GFX10PLUS-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX10PLUS-NEXT:    s_xor_b32 s8, s8, 1
+; GFX10PLUS-NEXT:    s_bfe_u64 s[10:11], s[8:9], 0x10000
+; GFX10PLUS-NEXT:    s_bfe_u64 s[8:9], s[8:9], 0x10000
+; GFX10PLUS-NEXT:    s_and_b64 s[4:5], s[10:11], s[4:5]
+; GFX10PLUS-NEXT:    s_and_b64 s[6:7], s[8:9], s[6:7]
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 354f5b954659a5..ab326a6cef3e3d 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -1117,13 +1117,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1201,13 +1202,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1286,13 +1288,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1371,13 +1374,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 794b10eea58b9b..12d856725c682d 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1517,10 +1517,11 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; CODEGEN-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
-; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v1, v1
@@ -1558,10 +1559,11 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; IR-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; IR-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; IR-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; IR-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; IR-IEEE-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; IR-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; IR-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
-; IR-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; IR-IEEE-GISEL-NEXT:    v_and_b32_e32 v1, -16, v1
 ; IR-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 97216b6c94693c..b516660f3bdc69 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -230,15 +230,16 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; VI-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
 ; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT:    s_movk_i32 s5, 0x7e00
-; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s5, s5, 0x7c00
+; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; VI-SAFE-GISEL-NEXT:    s_sub_i32 s7, 1, s4
 ; VI-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s4, 12
 ; VI-SAFE-GISEL-NEXT:    s_max_i32 s7, s7, 0
 ; VI-SAFE-GISEL-NEXT:    s_or_b32 s6, s2, s6
 ; VI-SAFE-GISEL-NEXT:    s_min_i32 s7, s7, 13
 ; VI-SAFE-GISEL-NEXT:    s_bitset1_b32 s2, 12
+; VI-SAFE-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
 ; VI-SAFE-GISEL-NEXT:    s_lshr_b32 s8, s2, s7
+; VI-SAFE-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
 ; VI-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s8, s7
 ; VI-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s7, s2
 ; VI-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
@@ -358,20 +359,21 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT:    s_movk_i32 s5, 0x7e00
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s5, s5, 0x7c00
+; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-SAFE-GISEL-NEXT:    s_sub_i32 s6, 1, s4
-; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s7, s2, 0x1000
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
 ; GFX10-SAFE-GISEL-NEXT:    s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s9, s4, 12
+; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s4, 12
 ; GFX10-SAFE-GISEL-NEXT:    s_min_i32 s6, s6, 13
-; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s9
-; GFX10-SAFE-GISEL-NEXT:    s_lshr_b32 s8, s7, s6
-; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s8, s6
-; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s7
+; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
+; GFX10-SAFE-GISEL-NEXT:    s_lshr_b32 s9, s8, s6
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s7
+; GFX10-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s9, s6
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
+; GFX10-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s8
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s6, s8, s6
+; GFX10-SAFE-GISEL-NEXT:    s_or_b32 s6, s9, s6
 ; GFX10-SAFE-GISEL-NEXT:    s_cmp_lt_i32 s4, 1
 ; GFX10-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s6, s2
 ; GFX10-SAFE-GISEL-NEXT:    s_and_b32 s6, s2, 7
@@ -497,24 +499,24 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT:    s_movk_i32 s5, 0x7e00
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_sub_i32 s6, 1, s4
-; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s7, s2, 0x1000
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
 ; GFX11-SAFE-GISEL-NEXT:    s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s9, s4, 12
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s4, 12
 ; GFX11-SAFE-GISEL-NEXT:    s_min_i32 s6, s6, 13
-; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s9
-; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s8, s7, s6
-; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s8, s6
-; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s7
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
+; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s9, s8, s6
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s7
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s9, s6
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s8
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s6, s8, s6
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s6, s9, s6
 ; GFX11-SAFE-GISEL-NEXT:    s_cmp_lt_i32 s4, 1
 ; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s6, s2
 ; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s6, s2, 7
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index 046f2624696958..2e8db0e1439a97 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -1850,10 +1850,11 @@ define float @v_sqrt_f32_ulp2(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1886,10 +1887,11 @@ define float @v_sqrt_f32_ulp25(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1922,10 +1924,11 @@ define float @v_sqrt_f32_ulp3(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1957,10 +1960,11 @@ define float @v_sqrt_f32_ulp2_fabs(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, s[4:5]
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2090,16 +2094,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v3, -16, v3
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2232,16 +2238,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, s[4:5]
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[6:7], |v1|, v2
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[6:7]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, v2
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v1, |v1|, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v1, |v1|, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[4:5]
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[6:7]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v3, -16, v3
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2328,10 +2336,11 @@ define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v1, v0
@@ -2425,10 +2434,11 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v3, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
@@ -2509,10 +2519,11 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v3, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
@@ -2589,10 +2600,11 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v3, 5, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
@@ -2658,16 +2670,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v4, 5, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v3, -16, v3
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v2, -16, v2
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
@@ -2802,16 +2816,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v6, 5, v5
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v6
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v6, 5, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v6
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v5, -16, v5
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v4, -16, v4
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -2929,16 +2945,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v6, 5, v5
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v6
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v6, 5, v4
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v6
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, s[4:5]
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v5, -16, v5
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v4, -16, v4
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -3029,10 +3047,11 @@ define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x)
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3064,10 +3083,11 @@ define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub)
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3099,10 +3119,11 @@ define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3698,10 +3719,11 @@ define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero n
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3733,10 +3755,11 @@ define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3768,10 +3791,11 @@ define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3911,12 +3935,13 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) {
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v0, v1
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-IEEE-NEXT:    v_lshlrev_b32_e32 v2, 5, v1
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v2, v0, v2
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v2, v2
+; GISEL-IEEE-NEXT:    v_and_b32_e32 v1, -16, v1
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; GISEL-IEEE-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GISEL-IEEE-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 196a3705ac8187..32da005ccb000d 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -40,8 +40,8 @@ define double @v_sqrt_f64(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -53,11 +53,10 @@ define double @v_sqrt_f64(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
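The f64 path below is the same fold with wider scale constants: the 0x100 (== 1 << 8) pre-scale becomes a shift of the condition bit by 8, and the 0xffffff80 (== -128) post-scale becomes a mask. A minimal LLVM IR sketch of the algebra, with the constants taken from the checks (illustrative only, not the emitted MIR):

  %bit  = zext i1 %c to i32
  %pre  = shl i32 %bit, 8          ; 0x100 == 1 << 8
  %sbit = sext i1 %c to i32
  %post = and i32 %sbit, -128      ; -128 == 0xffffff80

Because the bit now stays live across the whole rsq/fma refinement sequence, it is parked in a high register (v8, or v16/v17 in the vector tests) in the rewritten checks.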
@@ -99,9 +98,9 @@ define double @v_sqrt_f64_fneg(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], -v[0:1], v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -113,11 +112,10 @@ define double @v_sqrt_f64_fneg(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -160,9 +158,9 @@ define double @v_sqrt_f64_fabs(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -174,11 +172,10 @@ define double @v_sqrt_f64_fabs(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -221,9 +218,9 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], -|v[0:1]|, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -235,11 +232,10 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -284,8 +280,8 @@ define double @v_sqrt_f64_ninf(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -297,11 +293,10 @@ define double @v_sqrt_f64_ninf(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -344,8 +339,8 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -357,11 +352,10 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -404,8 +398,8 @@ define double @v_sqrt_f64_nnan(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -417,11 +411,10 @@ define double @v_sqrt_f64_nnan(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -464,8 +457,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -477,11 +470,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
@@ -533,8 +525,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -546,11 +538,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
@@ -602,8 +593,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -615,11 +606,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
@@ -671,8 +661,8 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -684,11 +674,10 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
@@ -740,8 +729,8 @@ define double @v_sqrt_f64_nsz(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -753,11 +742,10 @@ define double @v_sqrt_f64_nsz(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -800,8 +788,8 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -813,11 +801,10 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -860,8 +847,8 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -873,11 +860,10 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -920,8 +906,8 @@ define double @v_sqrt_f64_afn(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -933,11 +919,10 @@ define double @v_sqrt_f64_afn(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -980,8 +965,8 @@ define double @v_sqrt_f64_afn_nsz(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -993,11 +978,10 @@ define double @v_sqrt_f64_afn_nsz(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1058,14 +1042,15 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1078,23 +1063,22 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_and_b32_e32 v10, 0xffffff80, v17
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
@@ -1139,8 +1123,8 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1152,11 +1136,10 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1198,9 +1181,9 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1212,11 +1195,10 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1260,8 +1242,8 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1273,11 +1255,10 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1338,14 +1319,15 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1358,23 +1340,22 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_and_b32_e32 v10, 0xffffff80, v17
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
@@ -1419,8 +1400,8 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1432,11 +1413,10 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1479,8 +1459,8 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1492,11 +1472,10 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1539,8 +1518,8 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1552,11 +1531,10 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1599,8 +1577,8 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1612,11 +1590,10 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1677,14 +1654,15 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1697,23 +1675,22 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_and_b32_e32 v10, 0xffffff80, v17
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
@@ -1795,61 +1772,62 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
-; GISEL-NEXT:    v_mov_b32_e32 v6, s4
-; GISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
-; GISEL-NEXT:    v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v8, 0x100
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v8, vcc
-; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, v8, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v8, s[6:7]
-; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v9
-; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; GISEL-NEXT:    v_mov_b32_e32 v10, s4
+; GISEL-NEXT:    v_mov_b32_e32 v11, s5
+; GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[4:5], v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v18
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 8, v20
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v14
 ; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[8:9], v[2:3]
-; GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[4:5]
-; GISEL-NEXT:    v_mul_f64 v[12:13], v[6:7], 0.5
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 8, v19
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
+; GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[2:3]
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
 ; GISEL-NEXT:    v_mul_f64 v[6:7], v[0:1], v[6:7]
-; GISEL-NEXT:    v_mul_f64 v[14:15], v[8:9], 0.5
-; GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[8:9]
-; GISEL-NEXT:    v_mul_f64 v[16:17], v[10:11], 0.5
-; GISEL-NEXT:    v_mul_f64 v[10:11], v[4:5], v[10:11]
-; GISEL-NEXT:    v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5
-; GISEL-NEXT:    v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5
-; GISEL-NEXT:    v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5
-; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
-; GISEL-NEXT:    v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
-; GISEL-NEXT:    v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
-; GISEL-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
-; GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
-; GISEL-NEXT:    v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
-; GISEL-NEXT:    v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
-; GISEL-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
-; GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
-; GISEL-NEXT:    v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
-; GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
-; GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
-; GISEL-NEXT:    v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, 0, v12, vcc
-; GISEL-NEXT:    v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, v12, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[6:7]
-; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v14
-; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v13
-; GISEL-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v15
-; GISEL-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], v13
-; GISEL-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v12
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[14:15], v[10:11], 0.5
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[2:3], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7]
+; GISEL-NEXT:    v_rsq_f64_e32 v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[16:17], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[16:17], v[8:9], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[8:9], -v[14:15], v[10:11], 0.5
+; GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], 0.5
+; GISEL-NEXT:    v_mul_f64 v[12:13], v[4:5], v[12:13]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[8:9], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[14:15], v[8:9], v[14:15]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[16:17], v[12:13], 0.5
+; GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; GISEL-NEXT:    v_fma_f64 v[14:15], v[16:17], v[14:15], v[16:17]
+; GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[10:11], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[16:17], v[8:9], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[16:17], -v[12:13], v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[12:13], v[16:17], v[14:15], v[12:13]
+; GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[10:11], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[16:17], v[8:9], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[10:11], -v[12:13], v[12:13], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v16, 0x260
+; GISEL-NEXT:    v_and_b32_e32 v17, 0xffffff80, v18
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v16
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v17
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v16
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], v16
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[12:13]
+; GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff80, v19
+; GISEL-NEXT:    v_and_b32_e32 v13, 0xffffff80, v20
+; GISEL-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v12
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
index b3912aea55f791..fcc57b8bb7075e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -102,9 +102,9 @@ define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) {
 ; GISEL:       ; %bb.0: ; %entry
 ; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
-; GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
+; GISEL-NEXT:    global_store_b64 v[2:3], v[4:5], off
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL-NEXT:    s_endpgm
@@ -164,8 +164,8 @@ define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace
 ; GISEL-NEXT:    s_add_u32 s0, s0, 1
 ; GISEL-NEXT:    s_addc_u32 s1, s1, 0
 ; GISEL-NEXT:  .LBB5_2: ; %endif
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index 2c07b47bf1ed59..f21e8989640e2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -53,14 +53,15 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v0
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v0
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v3, v1
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   ret { half, i32 } %result
@@ -150,9 +151,10 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v1
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   %result.0 = extractvalue { half, i32 } %result, 1
@@ -227,20 +229,22 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v6, v1
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v4, v0
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v4, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v5, v1
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v6
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v6|, v3
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v4, v6
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v3, v3, v4
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32:
@@ -396,12 +400,14 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, v2
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v3
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v2
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
@@ -470,14 +476,15 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v0
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v0
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v3, v1
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   ret { half, i16 } %result
@@ -563,9 +570,10 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v1
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   %result.0 = extractvalue { half, i16 } %result, 1
@@ -631,10 +639,11 @@ define { float, i32 } @test_frexp_f32_i32(float %a) {
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v3, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { float, i32 } @llvm.frexp.f32.i32(float %a)
@@ -714,9 +723,10 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) {
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { float, i32 } @llvm.frexp.f32.i32(float %a)
   %result.0 = extractvalue { float, i32 } %result, 1
@@ -780,14 +790,16 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v4
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v2, v5, v2
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v5, v1
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v3, v4, v3
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -904,12 +916,14 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v3
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, v3
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v1, v2
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a)
   %result.1 = extractvalue { <2 x float>, <2 x i32> } %result, 1
@@ -963,10 +977,11 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[5:6]
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
-; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[5:6]
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v2, v5, v2
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1053,9 +1068,10 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7ff00000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
-; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { double, i32 } @llvm.frexp.f64.i32(double %a)
   %result.0 = extractvalue { double, i32 } %result, 1
@@ -1126,15 +1142,17 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX6-GISEL-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[5:6], v[0:1]
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v4, v7, v4
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[6:7], v[2:3]
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v5, v[2:3]
-; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v5, v8, v5
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1184,18 +1202,18 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) {
 }
 
 define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
-; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0
-; GFX6-NEXT:    s_mov_b32 s5, 0x7ff00000
-; GFX6-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
-; GFX6-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; GFX6-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; GFX6-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, 0x7ff00000
+; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX6-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX6-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
 ; GFX8:       ; %bb.0:
@@ -1217,6 +1235,21 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s5, 0x7ff00000
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 s[6:7], |v[0:1]|, s[4:5]
+; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[2:3]|, s[4:5]
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX6-GISEL-NEXT:    v_and_b32_e32 v1, v2, v1
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
   %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1
   ret <2 x i32> %result.1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index a0b2d3b32b7957..59376d0a595fce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -5567,9 +5567,10 @@ define float @v_log_f32_undef() {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5608,15 +5609,16 @@ define float @v_log_f32_undef() {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3805fdf4, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
@@ -5649,9 +5651,10 @@ define float @v_log_f32_undef() {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5686,21 +5689,22 @@ define float @v_log_f32_undef() {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 5ba72612321a6a..624648b73515b0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -5567,9 +5567,10 @@ define float @v_log10_f32_undef() {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5608,15 +5609,16 @@ define float @v_log10_f32_undef() {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v1
-; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v2
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x369a84fb, v2
+; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
@@ -5649,9 +5651,10 @@ define float @v_log10_f32_undef() {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5686,21 +5689,22 @@ define float @v_log10_f32_undef() {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 6ccef4c02ab3b1..f40c13d0762b9e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -2541,9 +2541,10 @@ define float @v_log2_f32_undef() {
 ; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX689-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX689-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GFX689-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX689-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX689-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -2560,10 +2561,11 @@ define float @v_log2_f32_undef() {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, s0
+; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 3dc565ceed0d0b..3363f92ad21a76 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -62,13 +62,13 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -78,9 +78,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
@@ -146,8 +145,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -159,11 +158,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
@@ -241,14 +239,14 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -258,9 +256,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
@@ -325,9 +322,9 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |s[0:1]|, v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -339,11 +336,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
@@ -423,13 +419,13 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -439,9 +435,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
@@ -507,8 +502,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -520,11 +515,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
@@ -602,14 +596,14 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], -s[0:1], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -619,9 +613,8 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
@@ -686,9 +679,9 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -s[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[2:3], -s[0:1], v[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -700,11 +693,10 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
@@ -784,13 +776,13 @@ define double @v_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -800,9 +792,8 @@ define double @v_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -866,8 +857,8 @@ define double @v_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -879,11 +870,10 @@ define double @v_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -952,14 +942,14 @@ define double @v_rsq_f64_fabs(double %x) {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -969,9 +959,8 @@ define double @v_rsq_f64_fabs(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1034,9 +1023,9 @@ define double @v_rsq_f64_fabs(double %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], |v[0:1]|, v[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1048,11 +1037,10 @@ define double @v_rsq_f64_fabs(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1123,13 +1111,13 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -1139,9 +1127,8 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1205,8 +1192,8 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1218,11 +1205,10 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1292,13 +1278,13 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -1308,9 +1294,8 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1374,8 +1359,8 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1387,11 +1372,10 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -1461,13 +1445,13 @@ define double @v_neg_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -1477,9 +1461,8 @@ define double @v_neg_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
@@ -1543,8 +1526,8 @@ define double @v_neg_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -1556,11 +1539,10 @@ define double @v_neg_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
@@ -1664,29 +1646,29 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 8, v14
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_and_b32_e32 v8, 0xffffff80, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
@@ -1697,13 +1679,13 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v14
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
@@ -1811,14 +1793,15 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -1831,23 +1814,22 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0
@@ -1965,29 +1947,29 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbff00000
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
-; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbff00000
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 8, v14
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_and_b32_e32 v8, 0xffffff80, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
@@ -1998,13 +1980,13 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v14
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
@@ -2112,14 +2094,15 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2132,23 +2115,22 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
@@ -2234,28 +2216,28 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 8, v14
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_and_b32_e32 v8, 0xffffff80, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
@@ -2266,13 +2248,13 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v14
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
@@ -2352,14 +2334,15 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2372,23 +2355,22 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
@@ -2507,28 +2489,28 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, s5
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 8, v14
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_and_b32_e32 v8, 0xffffff80, v12
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
 ; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
@@ -2539,13 +2521,13 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v14
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
@@ -2655,14 +2637,15 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -2675,23 +2658,22 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
@@ -2774,14 +2756,14 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], -|v[0:1]|, v[2:3]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -2791,9 +2773,8 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -2856,9 +2837,9 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], -|v[0:1]|, v[2:3]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -2870,11 +2851,10 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -2946,13 +2926,13 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -2962,9 +2942,8 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -3028,8 +3007,8 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3041,11 +3020,10 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -3107,12 +3085,12 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3122,9 +3100,8 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3177,8 +3154,8 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3190,11 +3167,10 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3252,12 +3228,12 @@ define double @v_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3267,9 +3243,8 @@ define double @v_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3322,8 +3297,8 @@ define double @v_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3335,11 +3310,10 @@ define double @v_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3398,12 +3372,12 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3413,9 +3387,8 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3470,8 +3443,8 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3483,11 +3456,10 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3546,12 +3518,12 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3561,9 +3533,8 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3616,8 +3587,8 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3629,11 +3600,10 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3691,12 +3661,12 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3706,9 +3676,8 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3761,8 +3730,8 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3774,11 +3743,10 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3836,12 +3804,12 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3851,9 +3819,8 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3906,8 +3873,8 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -3919,11 +3886,10 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -3982,12 +3948,12 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -3997,9 +3963,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4054,8 +4019,8 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4067,11 +4032,10 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4138,13 +4102,13 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -4154,9 +4118,8 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -4220,8 +4183,8 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4233,11 +4196,10 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -4325,44 +4287,44 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v12, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, s4
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v12
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v11, s5
-; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11]
+; SI-GISEL-NEXT:    v_and_b32_e32 v12, 0xffffff80, v12
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 8, v13
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v12, s[4:5]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v13, 0, v12, vcc
-; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[10:11], 0.5
-; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[10:11]
-; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[8:9], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[8:9], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[8:9]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
-; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0x260
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v12
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v13
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v13
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v12
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
@@ -4447,14 +4409,15 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v6, s[4:5]
-; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 8, v16
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v17
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
@@ -4467,28 +4430,27 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
 ; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v16
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x260
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v8
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
-; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
-; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v9
+; VI-GISEL-NEXT:    v_and_b32_e32 v9, 0xffffff80, v17
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v9
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
-; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
@@ -4550,12 +4512,12 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -4565,9 +4527,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4622,8 +4583,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v1, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4635,11 +4596,10 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4706,12 +4666,12 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
@@ -4721,9 +4681,8 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -4776,8 +4735,8 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -4789,11 +4748,10 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
@@ -5112,12 +5070,12 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5127,9 +5085,8 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5193,8 +5150,8 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5206,11 +5163,10 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v7, 0xffffff80, v10
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v7
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5279,12 +5235,12 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5294,9 +5250,8 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5360,8 +5315,8 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5373,11 +5328,10 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v7, 0xffffff80, v10
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v7
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5446,12 +5400,12 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
 ; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
@@ -5461,9 +5415,8 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v6, 0xffffff80, v10
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5527,8 +5480,8 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
@@ -5540,11 +5493,10 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
-; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v7, 0xffffff80, v10
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v7
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
@@ -5616,17 +5568,17 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
-; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0x40700000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x40700000
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x40700000
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -5634,9 +5586,8 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_and_b32_e32 v4, 0xffffff80, v8
 ; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7]
@@ -5702,10 +5653,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40700000
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 8, v8
 ; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
@@ -5717,11 +5668,10 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-GISEL-NEXT:    v_and_b32_e32 v5, 0xffffff80, v8
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5]
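For readers skimming the updated checks: the recurring change across these rsq.f64 hunks is that a select between an immediate and zero (previously materialized through v_mov_b32 plus v_cndmask_b32) becomes a condition materialized as 0/1 followed by a shift or a mask. Below is a minimal standalone sketch of the two integer identities involved, assuming a zero-extended (0/1) condition for the shift form and a sign-extended (0/all-ones) condition for the mask form; the helper names are illustrative and not part of the patch.

#include <cassert>
#include <cstdint>

// select(c, 0x100, 0) -> zext(c) << 8: a power-of-two constant becomes a
// shift amount (0x100 == 1 << 8, cf. v_lshlrev_b32_e32 ..., 8, ... above).
static uint32_t selectShl(bool c) { return static_cast<uint32_t>(c) << 8; }

// select(c, 0xffffff80, 0) -> sext(c) & 0xffffff80: an all-ones/all-zeros
// condition mask keeps or clears the constant
// (cf. v_and_b32_e32 ..., 0xffffff80, ... above).
static uint32_t selectAnd(bool c) {
  uint32_t Mask = c ? ~0u : 0u; // sext i1 -> i32
  return Mask & 0xffffff80u;
}

int main() {
  for (int i = 0; i < 2; ++i) {
    bool c = (i != 0);
    assert(selectShl(c) == (c ? 0x100u : 0u));
    assert(selectAnd(c) == (c ? 0xffffff80u : 0u));
  }
  return 0;
}

The shift form applies when the selected constant is a power of two; the mask form covers constants such as 0xffffff80 whose set bits survive an AND against an all-ones condition.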


