[llvm] e3fd8f8 - AMDGPU: Correctly expand f64 sqrt intrinsic

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 25 04:54:20 PDT 2023


Author: Matt Arsenault
Date: 2023-07-25T07:54:11-04:00
New Revision: e3fd8f83a801b1918508c7c0a71cc31bc95ad4d2

URL: https://github.com/llvm/llvm-project/commit/e3fd8f83a801b1918508c7c0a71cc31bc95ad4d2
DIFF: https://github.com/llvm/llvm-project/commit/e3fd8f83a801b1918508c7c0a71cc31bc95ad4d2.diff

LOG: AMDGPU: Correctly expand f64 sqrt intrinsic

rocm-device-libs and llpc were avoiding the f64 sqrt
intrinsic in favor of their own expansions. Port the
expansion into the backend. Both of these users should be
updated to call the intrinsic instead.

The library and llpc expansions are slightly different:
llpc uses an ldexp to do the scale, while the library uses a multiply.

This port uses ldexp for the scale instead of the multiply.
I believe v_ldexp_f64 and v_mul_f64 always take the same number of
cycles, but it's cheaper to materialize the 32-bit integer constant
than the 64-bit double constant.

The libraries have another fast version of sqrt which will
be handled separately.

I am tempted to do this in an IR expansion instead. In the IR
we could take advantage of computeKnownFPClass to avoid
the 0-or-inf argument check.
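
As a sketch of what the expansion computes, here is a minimal host-side
C++ model of the same algorithm (illustrative only, not the backend
code; rsq_estimate is a hypothetical stand-in for the low-precision
v_rsq_f64 estimate):

  #include <cmath>
  #include <cstdio>
  #include <limits>

  // Hypothetical stand-in for v_rsq_f64: a deliberately low-precision
  // 1/sqrt estimate, computed here in float.
  static double rsq_estimate(double X) {
    return 1.0f / std::sqrt(static_cast<float>(X));
  }

  static double sqrt_f64_expansion(double X) {
    // Scale tiny inputs up by 2^256 so the estimate stays in range.
    bool NeedScale = X < 0x1.0p-767;
    double SqrtX = std::ldexp(X, NeedScale ? 256 : 0);

    double SqrtY = rsq_estimate(SqrtX);             // y0 = rsq(x)
    double SqrtS = SqrtX * SqrtY;                   // g0 = x * y0
    double SqrtH = 0.5 * SqrtY;                     // h0 = 0.5 * y0

    double SqrtR = std::fma(-SqrtH, SqrtS, 0.5);    // r0 = 0.5 - h0 * g0
    SqrtS = std::fma(SqrtS, SqrtR, SqrtS);          // g1 = g0 * r0 + g0
    SqrtH = std::fma(SqrtH, SqrtR, SqrtH);          // h1 = h0 * r0 + h0

    double SqrtD = std::fma(-SqrtS, SqrtS, SqrtX);  // d0 = x - g1 * g1
    SqrtS = std::fma(SqrtD, SqrtH, SqrtS);          // g2 = d0 * h1 + g1

    SqrtD = std::fma(-SqrtS, SqrtS, SqrtX);         // d1 = x - g2 * g2
    double SqrtRet = std::fma(SqrtD, SqrtH, SqrtS); // g3 = d1 * h1 + g2

    // sqrt halves the exponent: undo the 2^256 input scale with 2^-128.
    SqrtRet = std::ldexp(SqrtRet, NeedScale ? -128 : 0);

    // rsq(+/-0) = +/-inf and rsq(+inf) = 0, so pass +inf, +0 and -0
    // through unchanged.
    if (SqrtX == 0.0 || SqrtX == std::numeric_limits<double>::infinity())
      return SqrtX;
    return SqrtRet;
  }

  int main() {
    std::printf("%.17g\n", sqrt_f64_expansion(2.0)); // ~1.4142135623730951
  }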

Added: 
    

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/docs/ReleaseNotes.rst
    llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h
    llvm/lib/Target/AMDGPU/VOP1Instructions.td
    llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
    llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
    llvm/test/CodeGen/AMDGPU/rsq.f64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index bec7b2f4ea554f..d90c83f7e8e7dd 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -965,6 +965,9 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
   =========================================  ==========================================================
   LLVM Intrinsic                             Description
   =========================================  ==========================================================
+  llvm.amdgcn.sqrt                           Provides direct access to v_sqrt_f64, v_sqrt_f32 and v_sqrt_f16
+                                             (on targets with half support). Performs the sqrt function.
+
   llvm.amdgcn.log                            Provides direct access to v_log_f32 and v_log_f16
                                              (on targets with half support). Peforms log2 function.
 
@@ -980,6 +983,8 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                              inputs. Backend will optimize out denormal scaling if
                                              marked with the :ref:`afn <fastmath_afn>` flag.
 
+  :ref:`llvm.sqrt <int_sqrt>`                Implemented for double, float and half (and vectors).
+
   :ref:`llvm.log <int_log>`                  Implemented for float and half (and vectors).
 
   :ref:`llvm.exp <int_exp>`                  Implemented for float and half (and vectors).

diff  --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index a04cbd2df10b09..a1ca57e9a7db72 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -173,6 +173,9 @@ Changes to the AMDGPU Backend
 * Implemented new 2ulp IEEE lowering strategy for float
   reciprocal. This is used by default for OpenCL on gfx9+.
 
+* `llvm.sqrt.f64` is now lowered correctly. Use `llvm.amdgcn.sqrt.f64`
+  for raw instruction access.
+
 Changes to the ARM Backend
 --------------------------
 

diff  --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index a1ff76487a12ec..5341b57477ce6d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1181,6 +1181,13 @@ class MachineIRBuilder {
                                 const SrcOp &Op0, const SrcOp &Op1,
                                 std::optional<unsigned> Flags = std::nullopt);
 
+  /// Build and insert a \p Res = G_IS_FPCLASS \p Src, \p Mask
+  MachineInstrBuilder buildIsFPClass(const DstOp &Res, const SrcOp &Src,
+                                     unsigned Mask) {
+    return buildInstr(TargetOpcode::G_IS_FPCLASS, {Res},
+                      {Src, SrcOp(static_cast<int64_t>(Mask))});
+  }
+
   /// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1
   ///
   /// \pre setBasicBlock or setMI must have been called.

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 923a549b9f0626..120c00b14a3693 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -907,7 +907,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .clampScalar(0, S16, S64);
 
   if (ST.has16BitInsts()) {
-    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
+    getActionDefinitionsBuilder(G_FSQRT)
+      .legalFor({S32, S16})
+      .customFor({S64})
+      .scalarize(0)
+      .clampScalar(0, S16, S64);
+    getActionDefinitionsBuilder(G_FFLOOR)
       .legalFor({S32, S64, S16})
       .scalarize(0)
       .clampScalar(0, S16, S64);
@@ -925,7 +930,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .lower();
   } else {
     getActionDefinitionsBuilder(G_FSQRT)
-      .legalFor({S32, S64})
+      .legalFor({S32})
+      .customFor({S64})
       .scalarize(0)
       .clampScalar(0, S32, S64);
 
@@ -1996,6 +2002,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
     return legalizeFDIV(MI, MRI, B);
   case TargetOpcode::G_FFREXP:
     return legalizeFFREXP(MI, MRI, B);
+  case TargetOpcode::G_FSQRT:
+    return legalizeFSQRT(MI, MRI, B);
   case TargetOpcode::G_UDIV:
   case TargetOpcode::G_UREM:
   case TargetOpcode::G_UDIVREM:
@@ -4829,6 +4837,90 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
+                                        MachineRegisterInfo &MRI,
+                                        MachineIRBuilder &B) const {
+  // For the double type, the SQRT and RSQ instructions don't have the
+  // required precision, so we apply Goldschmidt's algorithm to improve the
+  // result:
+  //
+  //   y0 = rsq(x)
+  //   g0 = x * y0
+  //   h0 = 0.5 * y0
+  //
+  //   r0 = 0.5 - h0 * g0
+  //   g1 = g0 * r0 + g0
+  //   h1 = h0 * r0 + h0
+  //
+  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
+  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
+  //   h2 = h1 * r1 + h1
+  //
+  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
+  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
+  //
+  //   sqrt(x) = g3
+
+  const LLT S1 = LLT::scalar(1);
+  const LLT S32 = LLT::scalar(32);
+  const LLT F64 = LLT::scalar(64);
+
+  Register Dst = MI.getOperand(0).getReg();
+  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
+
+  Register X = MI.getOperand(1).getReg();
+  unsigned Flags = MI.getFlags();
+
+  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
+
+  auto ZeroInt = B.buildConstant(S32, 0);
+  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
+
+  // Scale up input if it is too small.
+  auto ScaleUpFactor = B.buildConstant(S32, 256);
+  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
+  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
+
+  auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false)
+                   .addReg(SqrtX.getReg(0));
+
+  auto Half = B.buildFConstant(F64, 0.5);
+  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
+  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
+
+  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
+  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
+
+  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
+  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
+
+  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
+  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
+
+  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
+
+  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
+  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
+
+  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
+
+  // Scale down the result.
+  auto ScaleDownFactor = B.buildConstant(S32, -128);
+  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
+  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
+
+  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
+  // with finite only or nsz because rsq(+/-0) = +/-inf
+
+  // TODO: Check for DAZ and expand to subnormals
+  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
+
+  // If x is +INF, +0, or -0, use its original value
+  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
 // FIXME: Why do we handle this one but not other removed instructions?
 //

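For reference when reading the checks below: the fcZero | fcPosInf mask
passed to G_IS_FPCLASS evaluates to 0x260 (608), which is the immediate
shown as "G_IS_FPCLASS ..., 608" in the MIR tests and as the 0x260
operand of v_cmp_class_f64 in the assembly tests. A small sketch of the
arithmetic, with bit values mirroring LLVM's FPClassTest enum:

  #include <cstdio>

  // Bit values copied from llvm/ADT/FloatingPointMode.h's FPClassTest.
  enum FPClassTest : unsigned {
    fcNegZero = 0x0020,
    fcPosZero = 0x0040,
    fcZero    = fcNegZero | fcPosZero, // 0x0060
    fcPosInf  = 0x0200,
  };

  int main() {
    // Prints "0x260 = 608": +inf, +0 and -0 take the passthrough select.
    unsigned Mask = fcZero | fcPosInf;
    std::printf("0x%x = %u\n", Mask, Mask);
  }
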
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 1a91be1ea8d6f3..04773f275c8756 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -157,6 +157,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B) const;
 
+  bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI,
+                     MachineIRBuilder &B) const;
+
   bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  MachineIRBuilder &B) const;
 

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 97de41cd15c2ad..3148f49ff0d530 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -219,6 +219,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT, MVT::f64, Promote);
   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
 
+  setOperationAction(ISD::FSQRT, MVT::f64, Custom);
+
   setOperationAction(ISD::SELECT_CC,
                      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
 
@@ -4924,7 +4926,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
            "Load should return a value and a chain");
     return Result;
   }
-
+  case ISD::FSQRT:
+    if (Op.getValueType() == MVT::f64)
+      return lowerFSQRTF64(Op, DAG);
+    return SDValue();
   case ISD::FSIN:
   case ISD::FCOS:
     return LowerTrig(Op, DAG);
@@ -9749,6 +9754,87 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
+SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
+  // For the double type, the SQRT and RSQ instructions don't have the
+  // required precision, so we apply Goldschmidt's algorithm to improve the
+  // result:
+  //
+  //   y0 = rsq(x)
+  //   g0 = x * y0
+  //   h0 = 0.5 * y0
+  //
+  //   r0 = 0.5 - h0 * g0
+  //   g1 = g0 * r0 + g0
+  //   h1 = h0 * r0 + h0
+  //
+  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
+  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
+  //   h2 = h1 * r1 + h1
+  //
+  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
+  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
+  //
+  //   sqrt(x) = g3
+
+  SDNodeFlags Flags = Op->getFlags();
+
+  SDLoc DL(Op);
+
+  SDValue X = Op.getOperand(0);
+  SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
+
+  SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
+
+  SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
+
+  // Scale up input if it is too small.
+  SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
+  SDValue ScaleUp =
+      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
+  SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
+
+  SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
+
+  SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
+
+  SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
+  SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
+
+  SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
+  SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
+
+  SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
+
+  SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
+
+  SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
+  SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
+
+  SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
+
+  SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
+  SDValue SqrtD1 =
+      DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
+
+  SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
+
+  SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
+  SDValue ScaleDown =
+      DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
+  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
+
+  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
+  // with finite only or nsz because rsq(+/-0) = +/-inf
+
+  // TODO: Check for DAZ and expand to subnormals
+  SDValue IsZeroOrInf =
+      DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
+                  DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
+
+  // If x is +INF, +0, or -0, use its original value
+  return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
+                     Flags);
+}
+
 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();

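One more note on the scale factors used in both lowerings: the input is
scaled up by 2^256 (an even power, so the compensating shift of -128 is
an integer), and since sqrt(x * 2^256) = sqrt(x) * 2^128, the final
ldexp by -128 recovers sqrt(x) exactly; power-of-two scaling never
rounds (absent overflow or underflow). A quick host-side check of that
identity, assuming IEEE-754 doubles and a correctly rounded std::sqrt:

  #include <cassert>
  #include <cmath>

  int main() {
    // An input below the 0x1.0p-767 threshold, i.e. one that takes the
    // scaled path in the expansion above.
    double X = 0x1.8p-800;
    double Scaled = std::sqrt(std::ldexp(X, 256));
    assert(std::ldexp(Scaled, -128) == std::sqrt(X));
    return 0;
  }
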
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 4e62a056d1f25f..1745c0b9e88ea2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -109,6 +109,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;

diff  --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index ec38d22670567b..1a8efc6e3df200 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -332,7 +332,7 @@ defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>;
 let TRANS = 1, SchedRW = [WriteTrans64] in {
 defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
 defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
-defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>;
+defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
 } // End TRANS = 1, SchedRW = [WriteTrans64]
 
 let TRANS = 1, SchedRW = [WriteTrans32] in {

diff  --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
index a601768858c33c..f493cc5166fdb0 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-fp.ll
@@ -52,10 +52,10 @@ define i32 @fsqrt(i32 %arg) {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; ALL-SIZE-LABEL: 'fsqrt'
@@ -63,10 +63,10 @@ define i32 @fsqrt(i32 %arg) {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %F64 = call double @llvm.sqrt.f64(double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %F32 = call float @llvm.sqrt.f32(float undef)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
index 145f11db9eef54..7f97419b96845c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir
@@ -1,9 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI  %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9  %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9  %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9  %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=SI,GCN  %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefixes=VI,GCN %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN  %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN  %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9,GCN  %s
 
 ---
 name: test_fsqrt_s32
@@ -11,24 +11,12 @@ body: |
   bb.0:
     liveins: $vgpr0
 
-    ; SI-LABEL: name: test_fsqrt_s32
-    ; SI: liveins: $vgpr0
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
-    ; SI-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
-    ; VI-LABEL: name: test_fsqrt_s32
-    ; VI: liveins: $vgpr0
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
-    ; VI-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
-    ; GFX9-LABEL: name: test_fsqrt_s32
-    ; GFX9: liveins: $vgpr0
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
+    ; GCN-LABEL: name: test_fsqrt_s32
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[COPY]]
+    ; GCN-NEXT: $vgpr0 = COPY [[FSQRT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = G_FSQRT %0
     $vgpr0 = COPY %1
@@ -40,28 +28,82 @@ body: |
   bb.0:
     liveins: $vgpr0
 
-    ; SI-LABEL: name: test_fsqrt_s64
-    ; SI: liveins: $vgpr0
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]]
-    ; SI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64)
-    ; VI-LABEL: name: test_fsqrt_s64
-    ; VI: liveins: $vgpr0
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]]
-    ; VI-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64)
-    ; GFX9-LABEL: name: test_fsqrt_s64
-    ; GFX9: liveins: $vgpr0
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[COPY]]
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[FSQRT]](s64)
+    ; GCN-LABEL: name: test_fsqrt_s64
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000
+    ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]]
+    ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+    ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[COPY]], [[SELECT]](s32)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64)
+    ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01
+    ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]]
+    ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]]
+    ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]]
+    ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]]
+    ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]]
+    ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]]
+    ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]]
+    ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]]
+    ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]]
+    ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]]
+    ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32)
+    ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608
+    ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]]
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64)
     %0:_(s64) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_FSQRT %0
     $vgpr0_vgpr1 = COPY %1
 
+...
+
+---
+name: test_fsqrt_s64_ninf
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: test_fsqrt_s64_ninf
+    ; GCN: liveins: $vgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000
+    ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[COPY]](s64), [[C]]
+    ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+    ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[COPY]], [[SELECT]](s32)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64)
+    ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01
+    ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]]
+    ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]]
+    ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]]
+    ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]]
+    ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]]
+    ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]]
+    ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]]
+    ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]]
+    ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]]
+    ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]]
+    ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = ninf G_FLDEXP [[FMA6]], [[SELECT1]](s32)
+    ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608
+    ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = ninf G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]]
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[SELECT2]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = ninf G_FSQRT %0
+    $vgpr0_vgpr1 = COPY %1
+
 ...
 ---
 name: test_fsqrt_s16
@@ -108,33 +150,15 @@ body: |
   bb.0:
     liveins: $vgpr0_vgpr1
 
-    ; SI-LABEL: name: test_fsqrt_v2s32
-    ; SI: liveins: $vgpr0_vgpr1
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
-    ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
-    ; VI-LABEL: name: test_fsqrt_v2s32
-    ; VI: liveins: $vgpr0_vgpr1
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
-    ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
-    ; GFX9-LABEL: name: test_fsqrt_v2s32
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ; GCN-LABEL: name: test_fsqrt_v2s32
+    ; GCN: liveins: $vgpr0_vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
+    ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
+    ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32)
+    ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = G_FSQRT %0
     $vgpr0_vgpr1 = COPY %1
@@ -146,36 +170,16 @@ body: |
   bb.0:
     liveins: $vgpr0_vgpr1_vgpr2
 
-    ; SI-LABEL: name: test_fsqrt_v3s32
-    ; SI: liveins: $vgpr0_vgpr1_vgpr2
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
-    ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; SI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
-    ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
-    ; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
-    ; VI-LABEL: name: test_fsqrt_v3s32
-    ; VI: liveins: $vgpr0_vgpr1_vgpr2
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
-    ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; VI-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
-    ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
-    ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX9-LABEL: name: test_fsqrt_v3s32
-    ; GFX9: liveins: $vgpr0_vgpr1_vgpr2
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
-    ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
-    ; GFX9-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ; GCN-LABEL: name: test_fsqrt_v3s32
+    ; GCN: liveins: $vgpr0_vgpr1_vgpr2
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+    ; GCN-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+    ; GCN-NEXT: [[FSQRT:%[0-9]+]]:_(s32) = G_FSQRT [[UV]]
+    ; GCN-NEXT: [[FSQRT1:%[0-9]+]]:_(s32) = G_FSQRT [[UV1]]
+    ; GCN-NEXT: [[FSQRT2:%[0-9]+]]:_(s32) = G_FSQRT [[UV2]]
+    ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FSQRT]](s32), [[FSQRT1]](s32), [[FSQRT2]](s32)
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
     %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
     %1:_(<3 x  s32>) = G_FSQRT %0
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -187,33 +191,58 @@ body: |
   bb.0:
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3
 
-    ; SI-LABEL: name: test_fsqrt_v2s64
-    ; SI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
-    ; SI-NEXT: {{  $}}
-    ; SI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
-    ; SI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
-    ; SI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]]
-    ; SI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]]
-    ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64)
-    ; SI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
-    ; VI-LABEL: name: test_fsqrt_v2s64
-    ; VI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
-    ; VI-NEXT: {{  $}}
-    ; VI-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
-    ; VI-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
-    ; VI-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]]
-    ; VI-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]]
-    ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64)
-    ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
-    ; GFX9-LABEL: name: test_fsqrt_v2s64
-    ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
-    ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s64) = G_FSQRT [[UV]]
-    ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s64) = G_FSQRT [[UV1]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FSQRT]](s64), [[FSQRT1]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GCN-LABEL: name: test_fsqrt_v2s64
+    ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GCN-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
+    ; GCN-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0x1000000000000000
+    ; GCN-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GCN-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV]](s64), [[C]]
+    ; GCN-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
+    ; GCN-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C2]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP:%[0-9]+]]:_(s64) = G_FLDEXP [[UV]], [[SELECT]](s32)
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP]](s64)
+    ; GCN-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.000000e-01
+    ; GCN-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT]], [[C3]]
+    ; GCN-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP]], [[INT]]
+    ; GCN-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[FMUL]]
+    ; GCN-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL1]], [[C3]]
+    ; GCN-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMUL1]], [[FMA]], [[FMUL1]]
+    ; GCN-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FMUL]], [[FMA]], [[FMUL]]
+    ; GCN-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[FMA1]]
+    ; GCN-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA1]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FMA3]], [[FMA2]], [[FMA1]]
+    ; GCN-NEXT: [[FNEG2:%[0-9]+]]:_(s64) = G_FNEG [[FMA4]]
+    ; GCN-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG2]], [[FMA4]], [[FLDEXP]]
+    ; GCN-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FMA5]], [[FMA2]], [[FMA4]]
+    ; GCN-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; GCN-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[FCMP]](s1), [[C4]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP1:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA6]], [[SELECT1]](s32)
+    ; GCN-NEXT: [[IS_FPCLASS:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP]](s64), 608
+    ; GCN-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS]](s1), [[FLDEXP]], [[FLDEXP1]]
+    ; GCN-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(olt), [[UV1]](s64), [[C]]
+    ; GCN-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C2]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP2:%[0-9]+]]:_(s64) = G_FLDEXP [[UV1]], [[SELECT3]](s32)
+    ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[FLDEXP2]](s64)
+    ; GCN-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[INT1]], [[C3]]
+    ; GCN-NEXT: [[FMUL3:%[0-9]+]]:_(s64) = G_FMUL [[FLDEXP2]], [[INT1]]
+    ; GCN-NEXT: [[FNEG3:%[0-9]+]]:_(s64) = G_FNEG [[FMUL2]]
+    ; GCN-NEXT: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FNEG3]], [[FMUL3]], [[C3]]
+    ; GCN-NEXT: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FMUL3]], [[FMA7]], [[FMUL3]]
+    ; GCN-NEXT: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMUL2]], [[FMA7]], [[FMUL2]]
+    ; GCN-NEXT: [[FNEG4:%[0-9]+]]:_(s64) = G_FNEG [[FMA8]]
+    ; GCN-NEXT: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG4]], [[FMA8]], [[FLDEXP2]]
+    ; GCN-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMA8]]
+    ; GCN-NEXT: [[FNEG5:%[0-9]+]]:_(s64) = G_FNEG [[FMA11]]
+    ; GCN-NEXT: [[FMA12:%[0-9]+]]:_(s64) = G_FMA [[FNEG5]], [[FMA11]], [[FLDEXP2]]
+    ; GCN-NEXT: [[FMA13:%[0-9]+]]:_(s64) = G_FMA [[FMA12]], [[FMA9]], [[FMA11]]
+    ; GCN-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[FCMP1]](s1), [[C4]], [[C1]]
+    ; GCN-NEXT: [[FLDEXP3:%[0-9]+]]:_(s64) = G_FLDEXP [[FMA13]], [[SELECT4]](s32)
+    ; GCN-NEXT: [[IS_FPCLASS1:%[0-9]+]]:_(s1) = G_IS_FPCLASS [[FLDEXP2]](s64), 608
+    ; GCN-NEXT: [[SELECT5:%[0-9]+]]:_(s64) = G_SELECT [[IS_FPCLASS1]](s1), [[FLDEXP2]], [[FLDEXP3]]
+    ; GCN-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT2]](s64), [[SELECT5]](s64)
+    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
     %1:_(<2 x s64>) = G_FSQRT %0
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1

diff  --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 62393930d92e2b..8bb8f6c464cd02 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,48 +1,248 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s
 
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s
 
 define double @v_sqrt_f64(double %x) {
-; GCN-LABEL: v_sqrt_f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_fneg(double %x) {
-; GCN-LABEL: v_sqrt_f64_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], -v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fneg:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 9
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fneg:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.neg = fneg double %x
   %result = call double @llvm.sqrt.f64(double %x.neg)
   ret double %result
 }
 
 define double @v_sqrt_f64_fabs(double %x) {
-; GCN-LABEL: v_sqrt_f64_fabs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fabs:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fabs:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %result = call double @llvm.sqrt.f64(double %x.fabs)
   ret double %result
 }
 
 define double @v_sqrt_f64_fneg_fabs(double %x) {
-; GCN-LABEL: v_sqrt_f64_fneg_fabs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fneg_fabs:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 9
+; SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fneg_fabs:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %x.fabs.neg = fneg double %x.fabs
   %result = call double @llvm.sqrt.f64(double %x.fabs.neg)
@@ -50,42 +250,245 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 }
 
 define double @v_sqrt_f64_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call ninf double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" {
-; GCN-LABEL: v_sqrt_f64_no_infs_attribute:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_no_infs_attribute:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call ninf double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_nnan(double %x) {
-; GCN-LABEL: v_sqrt_f64_nnan:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nnan:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nnan:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan double @llvm.sqrt.f64(double %x)
   ret double %result
 }
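
The recurring magic numbers above decode as follows (my reading of the
assembly, not text from the patch): s_brev_b32 s5, 8 bit-reverses 8 into
0x10000000, making s[4:5] the double 2^-767, the threshold below which inputs
are pre-scaled; 0x100 is the +256 input scale; 0xffffff80 is the -128 output
rescale; and the v_cmp_class mask 0x260 selects -0, +0 and +inf, for which
the (scaled) input is returned unchanged. The two scale amounts are
consistent because a square root halves the exponent:

  \sqrt{2^{256}\,x} \;=\; 2^{128}\,\sqrt{x}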
 
 define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_mov_b32 s2, 0
+; GISEL-NEXT:    s_brev_b32 s3, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GISEL-NEXT:    ; return to shader part epilog
   %result = call double @llvm.sqrt.f64(double %x)
   %cast = bitcast double %result to <2 x i32>
   %cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -98,12 +501,65 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
 }
 
 define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_mov_b32 s2, 0
+; GISEL-NEXT:    s_brev_b32 s3, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GISEL-NEXT:    ; return to shader part epilog
   %result = call ninf double @llvm.sqrt.f64(double %x)
   %cast = bitcast double %result to <2 x i32>
   %cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -116,12 +572,65 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
 }
 
 define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64_afn:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64_afn:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64_afn:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_mov_b32 s2, 0
+; GISEL-NEXT:    s_brev_b32 s3, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GISEL-NEXT:    ; return to shader part epilog
   %result = call afn double @llvm.sqrt.f64(double %x)
   %cast = bitcast double %result to <2 x i32>
   %cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -134,12 +643,65 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
 }
 
 define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
-; GCN-LABEL: s_sqrt_f64_afn_nnan_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    ; return to shader part epilog
+; SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_mov_b32 s2, 0
+; GISEL-NEXT:    s_brev_b32 s3, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; GISEL-NEXT:    ; return to shader part epilog
   %result = call afn nnan ninf double @llvm.sqrt.f64(double %x)
   %cast = bitcast double %result to <2 x i32>
   %cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -152,167 +714,1147 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 }
 
 define double @v_sqrt_f64_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_nsz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nsz:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_nnan_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_nnan_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nnan_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan ninf double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_nnan_ninf_nsz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan ninf nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_afn(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_afn_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nsz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nsz:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
-; GCN-LABEL: v_sqrt_v2f64_afn:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v2f64_afn:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SDAG-NEXT:    v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v2f64_afn:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %result
 }
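
The <2 x double> variant above is the scalar expansion applied once per
element, with the two iterations interleaved in the schedule (presumably to
hide FMA latency). Semantically it is just a per-element loop over the
hypothetical scalar sketch given earlier:

  /* Per-element application of the sqrt_f64_expanded() sketch above; the
     interleaving seen in the assembly is a scheduling detail only. */
  static void sqrt_v2f64_expanded(double out[2], const double in[2])
  {
      for (int i = 0; i < 2; ++i)
          out[i] = sqrt_f64_expanded(in[i]);
  }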
 
 define double @v_sqrt_f64_afn_nnan(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nnan:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nnan:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nnan:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_fabs_afn_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs = call double @llvm.fabs.f64(double %x)
   %result = call afn ninf double @llvm.sqrt.f64(double %fabs)
   ret double %result
 }
 
 define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
-; GCN-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SDAG-NEXT:    v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %result
 }
 
 define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
-; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
-; GCN-LABEL: v_sqrt_f64__approx_func_fp_math:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
-; GCN-LABEL: v_sqrt_f64__enough_unsafe_attrs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
-; GCN-LABEL: v_sqrt_f64__unsafe_attr:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_f64__unsafe_attr:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f64__unsafe_attr:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
 }
 
 define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
-; GCN-LABEL: v_sqrt_v2f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v2f64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0
+; SDAG-NEXT:    s_brev_b32 s5, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SDAG-NEXT:    v_mul_f64 v[8:9], v[0:1], v[4:5]
+; SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SDAG-NEXT:    v_mul_f64 v[10:11], v[2:3], v[6:7]
+; SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v2f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    s_brev_b32 s5, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %result
 }
 
 define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
-; GCN-LABEL: v_sqrt_v3f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; GCN-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; GCN-NEXT:    v_sqrt_f64_e32 v[4:5], v[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: v_sqrt_v3f64:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s6, 0
+; SDAG-NEXT:    s_brev_b32 s7, 8
+; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
+; SDAG-NEXT:    v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; SDAG-NEXT:    v_rsq_f64_e32 v[8:9], v[2:3]
+; SDAG-NEXT:    v_rsq_f64_e32 v[10:11], v[4:5]
+; SDAG-NEXT:    v_mul_f64 v[12:13], v[0:1], v[6:7]
+; SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SDAG-NEXT:    v_mul_f64 v[14:15], v[2:3], v[8:9]
+; SDAG-NEXT:    v_mul_f64 v[8:9], v[8:9], 0.5
+; SDAG-NEXT:    v_mul_f64 v[16:17], v[4:5], v[10:11]
+; SDAG-NEXT:    v_mul_f64 v[10:11], v[10:11], 0.5
+; SDAG-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[12:13], 0.5
+; SDAG-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[14:15], 0.5
+; SDAG-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[16:17], 0.5
+; SDAG-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SDAG-NEXT:    v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
+; SDAG-NEXT:    v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
+; SDAG-NEXT:    v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[12:13], v[18:19], v[6:7], v[12:13]
+; SDAG-NEXT:    v_fma_f64 v[14:15], v[20:21], v[8:9], v[14:15]
+; SDAG-NEXT:    v_fma_f64 v[16:17], v[22:23], v[10:11], v[16:17]
+; SDAG-NEXT:    v_fma_f64 v[18:19], -v[12:13], v[12:13], v[0:1]
+; SDAG-NEXT:    v_fma_f64 v[20:21], -v[14:15], v[14:15], v[2:3]
+; SDAG-NEXT:    v_fma_f64 v[22:23], -v[16:17], v[16:17], v[4:5]
+; SDAG-NEXT:    v_fma_f64 v[6:7], v[18:19], v[6:7], v[12:13]
+; SDAG-NEXT:    v_mov_b32_e32 v12, 0xffffff80
+; SDAG-NEXT:    v_mov_b32_e32 v13, 0x260
+; SDAG-NEXT:    v_fma_f64 v[8:9], v[20:21], v[8:9], v[14:15]
+; SDAG-NEXT:    v_cndmask_b32_e32 v14, 0, v12, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, 0, v12, s[4:5]
+; SDAG-NEXT:    v_fma_f64 v[10:11], v[22:23], v[10:11], v[16:17]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[6:7]
+; SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v14
+; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v13
+; SDAG-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v15
+; SDAG-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], v13
+; SDAG-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v12
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_v3f64:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b32 s6, 0
+; GISEL-NEXT:    s_brev_b32 s7, 8
+; GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
+; GISEL-NEXT:    v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v6, 0x100
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, 0, v6, vcc
+; GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[6:7]
+; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v7
+; GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; GISEL-NEXT:    v_rsq_f64_e32 v[8:9], v[2:3]
+; GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[4:5]
+; GISEL-NEXT:    v_mul_f64 v[12:13], v[6:7], 0.5
+; GISEL-NEXT:    v_mul_f64 v[6:7], v[0:1], v[6:7]
+; GISEL-NEXT:    v_mul_f64 v[14:15], v[8:9], 0.5
+; GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[8:9]
+; GISEL-NEXT:    v_mul_f64 v[16:17], v[10:11], 0.5
+; GISEL-NEXT:    v_mul_f64 v[10:11], v[4:5], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[18:19], -v[12:13], v[6:7], 0.5
+; GISEL-NEXT:    v_fma_f64 v[20:21], -v[14:15], v[8:9], 0.5
+; GISEL-NEXT:    v_fma_f64 v[22:23], -v[16:17], v[10:11], 0.5
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[20:21], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[14:15], v[14:15], v[20:21], v[14:15]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[22:23], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[16:17], v[16:17], v[22:23], v[16:17]
+; GISEL-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
+; GISEL-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[6:7], v[0:1]
+; GISEL-NEXT:    v_fma_f64 v[20:21], -v[8:9], v[8:9], v[2:3]
+; GISEL-NEXT:    v_fma_f64 v[22:23], -v[10:11], v[10:11], v[4:5]
+; GISEL-NEXT:    v_fma_f64 v[6:7], v[18:19], v[12:13], v[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
+; GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
+; GISEL-NEXT:    v_fma_f64 v[8:9], v[20:21], v[14:15], v[8:9]
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, 0, v12, vcc
+; GISEL-NEXT:    v_fma_f64 v[10:11], v[22:23], v[16:17], v[10:11]
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, v12, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, v12, s[6:7]
+; GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v14
+; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v13
+; GISEL-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v15
+; GISEL-NEXT:    v_cmp_class_f64_e64 s[6:7], v[4:5], v13
+; GISEL-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v12
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x double> @llvm.sqrt.v3f64(<3 x double> %x)
   ret <3 x double> %result
 }
@@ -329,5 +1871,4 @@ attributes #2 = { "approx-func-fp-math"="true" }
 attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
 attributes #4 = { "unsafe-fp-math"="true" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GISEL: {{.*}}
-; SDAG: {{.*}}
+; GCN: {{.*}}

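For readers skimming the regenerated checks above rather than reading them line by line: the new sequence is a Goldschmidt-style square root with two correction steps, bracketed by a pre-scale for very small inputs and a class-mask fixup at the end. Below is a minimal C sketch of that reading, not the backend implementation; expanded_sqrt_f64 is a made-up name, and 1.0/sqrt() stands in for the v_rsq_f64 estimate, which in hardware is only approximate.

#include <math.h>

/* Hypothetical C model of the expansion the CHECK lines above encode;
 * 1.0/sqrt() stands in for the v_rsq_f64 estimate (approximate in
 * hardware), fma() for v_fma_f64, ldexp() for v_ldexp_f64. */
static double expanded_sqrt_f64(double x) {
  /* Inputs below 0x1p-767 (the s_brev_b32 s5, 8 constant) are scaled by
   * 2^256 so the estimate and its refinement stay in range. */
  int scaled = x < 0x1p-767;
  double sx = scaled ? ldexp(x, 256) : x;

  double r = 1.0 / sqrt(sx);   /* v_rsq_f64 estimate */
  double g = sx * r;           /* ~sqrt(sx) */
  double h = 0.5 * r;

  double e = fma(-h, g, 0.5);  /* one Goldschmidt step refines both */
  g = fma(g, e, g);
  h = fma(h, e, h);

  double d = fma(-g, g, sx);   /* two Newton corrections on the root */
  g = fma(d, h, g);
  d = fma(-g, g, sx);
  g = fma(d, h, g);

  if (scaled)
    g = ldexp(g, -128);        /* sqrt(x * 2^256) == sqrt(x) * 2^128 */

  /* v_cmp_class mask 0x260 (+/-0 and +inf): pass those through; negative
   * and NaN inputs already produce NaN from the rsq-based sequence. */
  return (sx == 0.0 || sx == INFINITY) ? sx : g;
}

The shape is what the CHECK lines pin down: one estimate, two multiplies, six FMAs, and the ldexp pair. The intermediate values of this sketch differ from hardware because the real sequence starts from the v_rsq estimate rather than a correctly rounded reciprocal square root.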
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index 74d34044b7d952..0f2eb38f44cd8d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -3,6 +3,7 @@
 declare float @llvm.amdgcn.rcp.f32(float) #0
 declare double @llvm.amdgcn.rcp.f64(double) #0
 
+declare double @llvm.amdgcn.sqrt.f64(double) #0
 declare double @llvm.sqrt.f64(double) #0
 declare float @llvm.sqrt.f32(float) #0
 
@@ -124,7 +125,15 @@ define amdgpu_kernel void @unsafe_rcp_pat_f64(ptr addrspace(1) %out, double %src
 
 ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f64:
 ; SI-NOT: v_rsq_f64_e32
-; SI: v_sqrt_f64
+; SI: v_rsq_f64
+; SI: v_mul_f64
+; SI: v_mul_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
 ; SI: v_rcp_f64
 define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
   %sqrt = call double @llvm.sqrt.f64(double %src)
@@ -133,12 +142,42 @@ define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %s
   ret void
 }
 
+; FUNC-LABEL: {{^}}safe_amdgcn_sqrt_rsq_rcp_pat_f64:
+; SI-NOT: v_rsq_f64_e32
+; SI: v_sqrt_f64
+; SI: v_rcp_f64
+define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
+  %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
+  %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+  store double %rcp, ptr addrspace(1) %out, align 8
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f64:
+; SI: v_rsq_f64
+; SI: v_mul_f64
+; SI: v_mul_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_rcp_f64
+; SI: buffer_store_dwordx2
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
+  %sqrt = call double @llvm.sqrt.f64(double %src)
+  %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
+  store double %rcp, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_amdgcn_sqrt_rsq_rcp_pat_f64:
 ; SI: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SQRT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
-  %sqrt = call double @llvm.sqrt.f64(double %src)
+define amdgpu_kernel void @unsafe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
+  %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
   %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
   store double %rcp, ptr addrspace(1) %out, align 8
   ret void

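The 0x260 immediate fed to v_cmp_class_f64 recurs in every expansion above and again in the rsq tests below. Decoded against the usual low-to-high class-test bit order (a reference sketch in C; the enum names are illustrative, not taken from any real header):

/* FP class test bits as v_cmp_class consumes them, low to high. */
enum fp_class_bit {
  FPCLS_SNAN     = 1u << 0,
  FPCLS_QNAN     = 1u << 1,
  FPCLS_NEG_INF  = 1u << 2,
  FPCLS_NEG_NORM = 1u << 3,
  FPCLS_NEG_SUBN = 1u << 4,
  FPCLS_NEG_ZERO = 1u << 5,
  FPCLS_POS_ZERO = 1u << 6,
  FPCLS_POS_SUBN = 1u << 7,
  FPCLS_POS_NORM = 1u << 8,
  FPCLS_POS_INF  = 1u << 9,
};

/* 0x260 == FPCLS_NEG_ZERO | FPCLS_POS_ZERO | FPCLS_POS_INF: exactly the
 * inputs whose square root is the input itself. */

So the final v_cndmask pair in each expansion is a pass-through for +/-0 and +inf, not a NaN guard; negative and NaN inputs already come out as NaN from the rsq-based sequence.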
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index a20aaac1598c32..9caea1b3b3853d 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -15,8 +15,30 @@ declare double @llvm.fabs.f64(double)
 define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ; SI-SDAG-LABEL: s_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -37,8 +59,32 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ;
 ; SI-GISEL-LABEL: s_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -59,7 +105,29 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ;
 ; VI-SDAG-LABEL: s_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -77,7 +145,31 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 ;
 ; VI-GISEL-LABEL: s_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -107,8 +199,30 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) {
 define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ; SI-SDAG-LABEL: s_rsq_f64_fabs:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -129,8 +243,32 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ;
 ; SI-GISEL-LABEL: s_rsq_f64_fabs:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -151,7 +289,29 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ;
 ; VI-SDAG-LABEL: s_rsq_f64_fabs:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e64 s[2:3], |s[0:1]|, v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -169,7 +329,31 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 ;
 ; VI-GISEL-LABEL: s_rsq_f64_fabs:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], |s[0:1]|
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |s[0:1]|, v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -200,8 +384,30 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) {
 define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ; SI-SDAG-LABEL: s_neg_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0xbff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -222,8 +428,32 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ;
 ; SI-GISEL-LABEL: s_neg_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -244,7 +474,29 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ;
 ; VI-SDAG-LABEL: s_neg_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -262,7 +514,31 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 ;
 ; VI-GISEL-LABEL: s_neg_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -292,8 +568,30 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) {
 define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ; SI-SDAG-LABEL: s_neg_rsq_neg_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], -s[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 9
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
 ; SI-SDAG-NEXT:    s_mov_b32 s2, 0xbff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -314,8 +612,32 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ;
 ; SI-GISEL-LABEL: s_neg_rsq_neg_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], -s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -336,7 +658,29 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ;
 ; VI-SDAG-LABEL: s_neg_rsq_neg_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], -s[0:1]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 9
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -354,7 +698,31 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 ;
 ; VI-GISEL-LABEL: s_neg_rsq_neg_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], -s[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], s[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -s[0:1], v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -386,8 +754,30 @@ define double @v_rsq_f64(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -407,8 +797,30 @@ define double @v_rsq_f64(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -428,7 +840,29 @@ define double @v_rsq_f64(double %x) {
 ; VI-SDAG-LABEL: v_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -445,7 +879,29 @@ define double @v_rsq_f64(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -467,8 +923,30 @@ define double @v_rsq_f64_fabs(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64_fabs:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -488,8 +966,30 @@ define double @v_rsq_f64_fabs(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64_fabs:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -509,7 +1009,29 @@ define double @v_rsq_f64_fabs(double %x) {
 ; VI-SDAG-LABEL: v_rsq_f64_fabs:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -526,7 +1048,29 @@ define double @v_rsq_f64_fabs(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64_fabs:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -549,8 +1093,30 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64_missing_contract0:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -570,8 +1136,30 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64_missing_contract0:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -591,7 +1179,29 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; VI-SDAG-LABEL: v_rsq_f64_missing_contract0:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -608,7 +1218,29 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64_missing_contract0:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -630,8 +1262,30 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64_missing_contract1:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -651,8 +1305,30 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64_missing_contract1:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -672,7 +1348,29 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; VI-SDAG-LABEL: v_rsq_f64_missing_contract1:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -689,7 +1387,29 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64_missing_contract1:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -711,8 +1431,30 @@ define double @v_neg_rsq_f64(double %x) {
 ; SI-SDAG-LABEL: v_neg_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xbff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -732,8 +1474,30 @@ define double @v_neg_rsq_f64(double %x) {
 ; SI-GISEL-LABEL: v_neg_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], -1.0, v[0:1], -1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -753,7 +1517,29 @@ define double @v_neg_rsq_f64(double %x) {
 ; VI-SDAG-LABEL: v_neg_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -770,7 +1556,29 @@ define double @v_neg_rsq_f64(double %x) {
 ; VI-GISEL-LABEL: v_neg_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -792,101 +1600,222 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; SI-SDAG-LABEL: v_rsq_v2f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
-; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
-; SI-SDAG-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; SI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[12:13], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v7
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13]
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9]
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v13
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-SDAG-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17]
-; SI-SDAG-NEXT:    v_mul_f64 v[12:13], v[18:19], v[10:11]
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v9
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19]
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[18:19], v[6:7]
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v11
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v19
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13]
+; SI-SDAG-NEXT:    s_nop 0
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_v2f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0x3ff00000
-; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0
-; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v13
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v19
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v19
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_nop 0
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_v2f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
-; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; VI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_mul_f64 v[18:19], v[16:17], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13]
-; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT:    v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], 1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[9:10], v[5:6]
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[11:12], v[7:8]
+; VI-SDAG-NEXT:    v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_div_scale_f64 v[13:14], vcc, 1.0, v[0:1], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10]
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12]
+; VI-SDAG-NEXT:    v_mul_f64 v[15:16], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_mul_f64 v[19:20], v[17:18], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16]
 ; VI-SDAG-NEXT:    s_mov_b64 vcc, s[4:5]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -894,9 +1823,48 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-LABEL: v_rsq_v2f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], 1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
@@ -929,101 +1897,222 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; SI-SDAG-LABEL: v_neg_rsq_v2f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xbff00000
-; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; SI-SDAG-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; SI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[12:13], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v7
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13]
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9]
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v13
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-SDAG-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17]
-; SI-SDAG-NEXT:    v_mul_f64 v[12:13], v[18:19], v[10:11]
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v9
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19]
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[18:19], v[6:7]
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v11
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v19
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13]
+; SI-SDAG-NEXT:    s_nop 0
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_rsq_v2f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v20, 0xbff00000
-; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
-; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v13
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v19
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v20, v19
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], -1.0
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_nop 0
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_v2f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
-; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; VI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_mul_f64 v[18:19], v[16:17], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13]
-; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT:    v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], -1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[17:18], s[4:5], -1.0, v[2:3], -1.0
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[9:10], v[5:6]
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[11:12], v[7:8]
+; VI-SDAG-NEXT:    v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_div_scale_f64 v[13:14], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10]
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12]
+; VI-SDAG-NEXT:    v_mul_f64 v[15:16], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_mul_f64 v[19:20], v[17:18], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16]
 ; VI-SDAG-NEXT:    s_mov_b64 vcc, s[4:5]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1031,9 +2120,48 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-LABEL: v_neg_rsq_v2f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
@@ -1066,8 +2194,30 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xbff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1089,43 +2239,105 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0xbff00000
-; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v13
+; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v16, v13
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[6:7]
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v19
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v19
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], s[4:5]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_nop 0
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5]
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1144,9 +2356,48 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; VI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
@@ -1179,105 +2430,224 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-SDAG-LABEL: v_neg_pos_rsq_v2f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v6
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xbff00000
-; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; SI-SDAG-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[10:11], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; SI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[0:1], v[0:1], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[16:17], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[12:13], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13]
+; SI-SDAG-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9]
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v7
+; SI-SDAG-NEXT:    v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13]
+; SI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9]
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v13
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-SDAG-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13]
-; SI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
-; SI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[18:19], v[6:7]
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3ff00000
-; SI-SDAG-NEXT:    v_mul_f64 v[12:13], v[18:19], v[10:11]
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17]
-; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19]
-; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v9
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19]
+; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v11
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v19
 ; SI-SDAG-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; SI-SDAG-NEXT:    s_nop 0
-; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13]
+; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_pos_rsq_v2f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_mov_b32_e32 v16, 0xbff00000
-; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-GISEL-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v14, vcc
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[10:11], v[12:13], 1.0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[12:13], v[6:7], v[12:13]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[10:11], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[6:7]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_mul_f64 v[14:15], v[12:13], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[16:17], -v[10:11], v[14:15], v[12:13]
+; SI-GISEL-NEXT:    v_fma_f64 v[18:19], -v[8:9], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[18:19], v[6:7]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v13
+; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v16, v13
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
-; SI-GISEL-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_mul_f64 v[16:17], v[12:13], v[6:7]
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13]
-; SI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7]
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v11
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[6:7]
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11]
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], v[18:19], v[4:5]
-; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15]
 ; SI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v19
+; SI-GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v3, v9
 ; SI-GISEL-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0
-; SI-GISEL-NEXT:    s_nop 1
-; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11]
-; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_nop 0
+; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
+; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_pos_rsq_v2f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
-; VI-SDAG-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
-; VI-SDAG-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0
-; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9]
-; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; VI-SDAG-NEXT:    v_mul_f64 v[14:15], v[12:13], v[8:9]
-; VI-SDAG-NEXT:    v_mul_f64 v[18:19], v[16:17], v[10:11]
-; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13]
-; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT:    v_div_scale_f64 v[5:6], s[6:7], v[0:1], v[0:1], -1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[7:8], s[4:5], v[2:3], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_div_scale_f64 v[17:18], s[4:5], 1.0, v[2:3], 1.0
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[9:10], v[5:6]
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[11:12], v[7:8]
+; VI-SDAG-NEXT:    v_fma_f64 v[13:14], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_div_scale_f64 v[13:14], vcc, -1.0, v[0:1], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[15:16], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[15:16], -v[5:6], v[9:10], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[19:20], -v[7:8], v[11:12], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], v[9:10], v[15:16], v[9:10]
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], v[11:12], v[19:20], v[11:12]
+; VI-SDAG-NEXT:    v_mul_f64 v[15:16], v[13:14], v[9:10]
+; VI-SDAG-NEXT:    v_mul_f64 v[19:20], v[17:18], v[11:12]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[5:6], v[15:16], v[13:14]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[7:8], v[19:20], v[17:18]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[9:10], v[15:16]
 ; VI-SDAG-NEXT:    s_mov_b64 vcc, s[4:5]
-; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
+; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -1285,9 +2655,48 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-LABEL: v_neg_pos_rsq_v2f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[6:7], v[0:1], v[0:1], -1.0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
@@ -1320,8 +2729,30 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 9
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1341,8 +2772,30 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1362,7 +2815,29 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; VI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 9
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1379,7 +2854,29 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, s[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1403,8 +2900,30 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1424,8 +2943,30 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1445,7 +2986,29 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; VI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1462,7 +3025,29 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1481,226 +3066,1033 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 }
 
 define double @v_rsq_f64__afn_fdiv(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_fdiv:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_fdiv:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn double 1.0, %sqrt
   ret double %rsq
 }
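
For readers tracing the new checks: the v_rsq_f64 + v_fma_f64 chain in
each expansion is a Goldschmidt square-root iteration. A C sketch of
what it computes (the function and its variable names are illustrative,
not part of the patch; rsq0 stands in for the approximate v_rsq_f64
result):

#include <math.h>

/* Goldschmidt square-root iteration; each fma mirrors one v_fma_f64
   in the checks above. rsq0 ~ 1/sqrt(x). */
static double goldschmidt_sqrt(double x, double rsq0) {
  double g = x * rsq0;         /* initial sqrt(x) estimate  */
  double h = 0.5 * rsq0;       /* half-reciprocal estimate  */
  double t = fma(-h, g, 0.5);  /* shared correction term    */
  g = fma(g, t, g);            /* refined sqrt estimate     */
  h = fma(h, t, h);            /* refined half-reciprocal   */
  double d = fma(-g, g, x);    /* residual x - g*g          */
  g = fma(d, h, g);
  d = fma(-g, g, x);           /* second residual           */
  return fma(d, h, g);         /* ~ sqrt(x)                 */
}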
 
 define double @v_rsq_f64__afn(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn double 1.0, %sqrt
   ret double %rsq
 }
 
 define double @v_neg_rsq_f64__afn(double %x) {
-; SDAG-LABEL: v_neg_rsq_f64__afn:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_neg_rsq_f64__afn:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_neg_rsq_f64__afn:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_neg_rsq_f64__afn:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_neg_rsq_f64__afn:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_neg_rsq_f64__afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn double -1.0, %sqrt
   ret double %rsq
 }
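
The trailing v_rcp_f64 chain in these tests is the standard fdiv
expansion: two Newton-Raphson steps on the reciprocal followed by one
refinement of the quotient. A C sketch (illustrative only; 1.0 / a
stands in for the approximate v_rcp_f64 result):

#include <math.h>

/* n / a via reciprocal refinement, matching the v_rcp_f64 chain.
   For n == -1.0 the initial quotient is the -1.0 multiply seen in
   the v_neg_rsq variants. */
static double refined_fdiv(double n, double a) {
  double r = 1.0 / a;              /* stands in for v_rcp_f64 */
  r = fma(fma(-a, r, 1.0), r, r);  /* Newton step 1           */
  r = fma(fma(-a, r, 1.0), r, r);  /* Newton step 2           */
  double q = n * r;                /* initial quotient        */
  return fma(fma(-a, q, n), r, q); /* refine the quotient     */
}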
 
 define double @v_rsq_f64__afn_ninf(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_ninf:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_ninf:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_ninf:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn ninf double 1.0, %sqrt
   ret double %rsq
 }
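
Every variant shares the same guard around the iteration: an input
below the smallest normal double (0x0010000000000000, materialized
with s_brev_b32 s5, 8) is pre-scaled by 2^256 and the result rescaled
by 2^-128 (the 0xffffff80 constant), which is exact because
sqrt(x * 2^256) == sqrt(x) * 2^128; the final v_cmp_class_f64 against
the mask 0x260 (+inf, +0, -0) then passes those inputs through
unchanged. A C sketch of the guard (illustrative; sqrt() stands in
for the iteration sketched earlier):

#include <math.h>

/* Denormal pre-scale and 0/inf passthrough around the iteration. */
static double guarded_sqrt(double x) {
  int scaled = x < 0x1p-1022;             /* below smallest normal  */
  double in = scaled ? ldexp(x, 256) : x;
  double r = sqrt(in);                    /* the Goldschmidt chain  */
  if (scaled)
    r = ldexp(r, -128);
  /* v_cmp_class_f64 with mask 0x260 selects +/-0 and +inf as-is. */
  return (in == 0.0 || in == INFINITY) ? in : r;
}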
 
 define double @v_rsq_f64__afn_nnan(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_nnan:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_nnan:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_nnan:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_nnan:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_nnan:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_nnan:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn nnan double 1.0, %sqrt
   ret double %rsq
 }
 
 define double @v_rsq_f64__afn_nnan_ninf(double %x) {
-; SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn nnan ninf double 1.0, %sqrt
   ret double %rsq
 }
 
 define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
-; SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], -1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn nnan ninf double -1.0, %sqrt
   ret double %rsq
@@ -1710,8 +4102,30 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -1731,8 +4145,30 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -1752,7 +4188,29 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; VI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1769,7 +4227,29 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; VI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1788,71 +4268,250 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 }
 
 define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
-; SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
-; SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
-; SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_mov_b32_e32 v14, 0xffffff80
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v14, vcc
+; SI-SDAG-NEXT:    v_mov_b32_e32 v15, 0x260
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v15
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[8:9], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[0:1], v[8:9]
+; SI-SDAG-NEXT:    v_mul_f64 v[8:9], v[8:9], 0.5
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[12:13], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v15
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[10:11], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v14, s[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[4:5], v[0:1]
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[6:7], v[2:3]
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; SI-GISEL-NEXT:    v_mul_f64 v[8:9], 1.0, v[0:1]
-; SI-GISEL-NEXT:    v_mul_f64 v[10:11], 1.0, v[2:3]
-; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0
-; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9]
-; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v12, 0xffffff80
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v13, 0, v12, vcc
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v10, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v8
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[10:11], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[10:11], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], v[2:3], v[10:11]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v13, 0x260
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 0.5
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v13
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], v[10:11], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[8:9], v[8:9], v[2:3]
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v13
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[10:11], v[6:7], v[8:9]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[4:5]
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-GISEL-NEXT:    v_mul_f64 v[8:9], 1.0, v[4:5]
+; SI-GISEL-NEXT:    v_mul_f64 v[10:11], 1.0, v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; VI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[6:7], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[8:9], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_mul_f64 v[10:11], v[0:1], v[6:7]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[8:9], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[10:11], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-SDAG-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[5:6], v[0:1]
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[7:8], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[9:10], -v[0:1], v[5:6], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[11:12], -v[2:3], v[7:8], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[9:10], v[5:6], v[5:6]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[11:12], v[7:8], v[7:8]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; VI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[4:5], v[0:1]
-; VI-GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[6:7], v[2:3]
-; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1]
-; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
-; VI-GISEL-NEXT:    v_mul_f64 v[8:9], 1.0, v[0:1]
-; VI-GISEL-NEXT:    v_mul_f64 v[10:11], 1.0, v[2:3]
-; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0
-; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9]
-; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[0:1], v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[2:3], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v8, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; VI-GISEL-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v10
+; VI-GISEL-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v8
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; VI-GISEL-NEXT:    v_mul_f64 v[8:9], 1.0, v[4:5]
+; VI-GISEL-NEXT:    v_mul_f64 v[10:11], 1.0, v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   %rsq = fdiv contract afn nnan ninf <2 x double> <double 1.0, double 1.0>, %sqrt
@@ -1860,34 +4519,155 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 }
 
 define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
-; SDAG-LABEL: s_rsq_f64_unsafe:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    v_readfirstlane_b32 s0, v0
-; SDAG-NEXT:    v_readfirstlane_b32 s1, v1
-; SDAG-NEXT:    ; return to shader part epilog
-;
-; GISEL-LABEL: s_rsq_f64_unsafe:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], s[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[2:3], s[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
-; GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
-; GISEL-NEXT:    v_readfirstlane_b32 s1, v1
-; GISEL-NEXT:    ; return to shader part epilog
+; SI-SDAG-LABEL: s_rsq_f64_unsafe:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; SI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0x260
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v8
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; SI-SDAG-NEXT:    ; return to shader part epilog
+;
+; SI-GISEL-LABEL: s_rsq_f64_unsafe:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_mov_b32 s2, 0
+; SI-GISEL-NEXT:    s_brev_b32 s3, 8
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; SI-GISEL-NEXT:    ; return to shader part epilog
+;
+; VI-SDAG-LABEL: s_rsq_f64_unsafe:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; VI-SDAG-NEXT:    v_bfrev_b32_e32 v1, 8
+; VI-SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-SDAG-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SDAG-NEXT:    s_cselect_b32 s0, 0xffffff80, 0
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0x260
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], s0
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-SDAG-NEXT:    v_readfirstlane_b32 s1, v1
+; VI-SDAG-NEXT:    ; return to shader part epilog
+;
+; VI-GISEL-LABEL: s_rsq_f64_unsafe:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_mov_b32 s2, 0
+; VI-GISEL-NEXT:    s_brev_b32 s3, 8
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], s[0:1], v0
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-GISEL-NEXT:    v_readfirstlane_b32 s1, v1
+; VI-GISEL-NEXT:    ; return to shader part epilog
   %rsq = call contract double @llvm.sqrt.f64(double %x)
   %result = fdiv contract double 1.0, %rsq
   %cast = bitcast double %result to <2 x i32>
@@ -1901,32 +4681,147 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 }
 
 define double @v_rsq_f64_unsafe(double %x) #0 {
-; SDAG-LABEL: v_rsq_f64_unsafe:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
-; SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
-; SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_rsq_f64_unsafe:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1]
-; GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[0:1]
-; GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0
-; GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5]
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; SI-SDAG-LABEL: v_rsq_f64_unsafe:
+; SI-SDAG:       ; %bb.0:
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_rsq_f64_unsafe:
+; SI-GISEL:       ; %bb.0:
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-SDAG-LABEL: v_rsq_f64_unsafe:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-SDAG-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
+; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_rsq_f64_unsafe:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-GISEL-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], 1.0, v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
+; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call double @llvm.sqrt.f64(double %x)
   %rsq = fdiv double 1.0, %sqrt
   ret double %rsq
@@ -2190,7 +5085,29 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-SDAG-LABEL: v_div_contract_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -2210,7 +5127,29 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-LABEL: v_div_contract_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -2230,7 +5169,29 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; VI-SDAG-LABEL: v_div_contract_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2247,7 +5208,29 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-LABEL: v_div_contract_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2269,7 +5252,29 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-SDAG-LABEL: v_div_arcp_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -2289,7 +5294,29 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-LABEL: v_div_arcp_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -2309,7 +5336,29 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; VI-SDAG-LABEL: v_div_arcp_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2326,7 +5375,29 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-LABEL: v_div_arcp_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2348,7 +5419,29 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_mov_b32 s4, 0
+; SI-SDAG-NEXT:    s_brev_b32 s5, 8
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_mov_b32_e32 v11, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v11
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
@@ -2368,7 +5461,29 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_mov_b32 s4, 0
+; SI-GISEL-NEXT:    s_brev_b32 s5, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v10, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x260
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -2388,7 +5503,29 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; VI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-SDAG-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2405,7 +5542,29 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x100
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mul_f64 v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[6:7], v[4:5], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v6, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v6, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v7
+; VI-GISEL-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v6
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -2427,9 +5586,30 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
 ; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; SI-SDAG-NEXT:    s_brev_b32 s7, 8
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0
+; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SI-SDAG-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-SDAG-NEXT:    v_mov_b32_e32 v9, 0x260
+; SI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s7, 0x40700000
+; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; SI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7]
 ; SI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
@@ -2449,10 +5629,31 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; SI-GISEL-NEXT:    s_mov_b32 s6, 0
+; SI-GISEL-NEXT:    s_brev_b32 s7, 8
+; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; SI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v9, 0x260
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0x40700000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x40700000
+; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v8, vcc
+; SI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; SI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v9
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7]
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[8:9], s[4:5], s[6:7], v[0:1], s[6:7]
 ; SI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -2472,9 +5673,30 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
 ; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
+; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
 ; VI-SDAG-NEXT:    s_mov_b32 s5, 0x40700000
+; VI-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; VI-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
+; VI-SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5]
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -2491,9 +5713,30 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
+; VI-GISEL-NEXT:    s_brev_b32 s5, 8
+; VI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x100
 ; VI-GISEL-NEXT:    s_mov_b32 s5, 0x40700000
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; VI-GISEL-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xffffff80
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x260
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
+; VI-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5]
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -2514,3 +5757,5 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 attributes #0 = { "unsafe-fp-math"="true" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
+; GISEL: {{.*}}
+; SDAG: {{.*}}
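
For readers skimming the new checks: here is a minimal C sketch of the scalar
sequence the SDAG and GISEL output above implements. This is illustrative only,
not code from the patch; expand_sqrt_f64 and its locals are made-up names, and
1.0 / sqrt() stands in for the v_rsq_f64 hardware estimate.

    #include <math.h>

    /* Minimal sketch, assuming the pattern visible in the checks above. */
    static double expand_sqrt_f64(double x) {
      /* Inputs below 0x1p-767 are scaled up by 2^256 so the estimate does
         not flush (s_brev_b32 s5, 8 builds the threshold constant; the
         v_ldexp_f64 with 0x100 applies the scale). */
      int scaled = x < 0x1.0p-767;
      double sx = ldexp(x, scaled ? 256 : 0);

      double r = 1.0 / sqrt(sx); /* stand-in for the v_rsq_f64 estimate */
      double h = 0.5 * r;        /* the v_mul_f64 ..., 0.5 */
      double s = sx * r;         /* initial sqrt estimate */

      /* One Goldschmidt step refining both s and h (the fmas against 0.5),
         then two Newton residual corrections against the scaled input. */
      double e = fma(-h, s, 0.5);
      s = fma(s, e, s);
      h = fma(h, e, h);

      double d = fma(-s, s, sx);
      s = fma(d, h, s);
      d = fma(-s, s, sx);
      s = fma(d, h, s);

      /* Undo the scale: sqrt(2^256 * x) = 2^128 * sqrt(x), hence the
         v_ldexp_f64 by 0xffffff80 (-128). */
      s = ldexp(s, scaled ? -128 : 0);

      /* v_cmp_class_f64 with mask 0x260 (+inf, +0, -0) passes the scaled
         input through unchanged via the final v_cndmask pair. */
      if (sx == 0.0 || isinf(sx))
        return sx;
      return s;
    }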

More information about the llvm-commits mailing list