[llvm] [AMDGPU] Implement IR expansion for frem instruction (PR #130988)

Frederik Harwath via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 20 09:56:13 PDT 2025


https://github.com/frederik-h updated https://github.com/llvm/llvm-project/pull/130988

>From b0f799670bceca75088b5b8457021dd7c1c11b85 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Tue, 25 Feb 2025 06:24:40 -0500
Subject: [PATCH 01/19] Implement IR expansion for frem instruction

This patch implements a correctly rounded expansion of the frem
instruction in LLVM IR. This is useful for target architectures where
such an expansion is too involved to be implemented during ISel
lowering. The expansion is based on the code from the AMD device libs
and has been tested successfully against the OpenCL conformance tests
on AMDGPU. The expansion is implemented in the preexisting
"expand-large-fp-convert" pass. It is enabled by a new
"shouldExpandFRemInIR" function in TargetLowering.
---
 llvm/include/llvm/CodeGen/TargetLowering.h  |    4 +
 llvm/lib/CodeGen/ExpandLargeFpConvert.cpp   |  389 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h |    2 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 4547 ++++++++++++++++---
 llvm/test/CodeGen/AMDGPU/wave32.ll          |  384 +-
 5 files changed, 4658 insertions(+), 668 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2089d47e9cbc8..b64c57fdba992 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5671,6 +5671,10 @@ class TargetLowering : public TargetLoweringBase {
                                        LoadSDNode *OriginalLoad,
                                        SelectionDAG &DAG) const;
 
+  /// Indicates whether the FRem instruction should be expanded before
+  /// ISel in the LLVM IR.
+  virtual bool shouldExpandFRemInIR() const { return false; };
+
 private:
   SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                            const SDLoc &DL, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index ee583a25214ef..31d3779eb7c9f 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -6,11 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-
 // This pass expands ‘fptoui .. to’, ‘fptosi .. to’, ‘uitofp .. to’,
 // ‘sitofp .. to’ instructions with a bitwidth above a threshold into
 // auto-generated functions. This is useful for targets like x86_64 that cannot
 // lower fp convertions with more than 128 bits.
+// Furthermore, the pass can expand FRem instructions if requested in the
+// TargetLowering for the current target.
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,6 +22,7 @@
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/FMF.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/PassManager.h"
@@ -28,6 +30,9 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "expand-large-fp-convert"
 
 using namespace llvm;
 
@@ -37,6 +42,376 @@ static cl::opt<unsigned>
                      cl::desc("fp convert instructions on integers with "
                               "more than <N> bits are expanded."));
 
+namespace {
+/// This class implements a precise expansion of the frem instruction.
+/// The generated code is based on the fmod implementation in the AMD device
+/// libs.
+class FRemExpander {
+  /// The IRBuilder to use for the expansion.
+  IRBuilder<> &B;
+
+  /// Floating point type of the return value and the arguments of the FRem
+  /// instructions that should be expanded.
+  Type *FremTy;
+
+  /// Floating point type to use for the computation.  This may be
+  /// wider than the \p FremTy.
+  Type *ComputeFpTy;
+
+  /// Integer type that can hold floating point values of type \p FremTy.
+  Type *IntTy;
+
+  /// Integer type used to hold the exponents returned by frexp.
+  Type *ExTy;
+
+  /// How many bits of the quotient to compute per iteration of the
+  /// algorithm, stored as a value of type \p ExTy.
+  Value *Bits;
+
+  /// Constant 1 of type \p ExTy.
+  Value *One;
+
+  /// The sign bit for floating point values of type \p FremTy.
+  const unsigned long Signbit;
+
+public:
+  static std::optional<FRemExpander> create(IRBuilder<> &B, Type *Ty) {
+    if (Ty->is16bitFPTy())
+      return FRemExpander{B, Ty, 11, 0x8000, B.getFloatTy(), B.getInt16Ty()};
+    if (Ty->isFloatTy() || Ty->isHalfTy())
+      return FRemExpander{B, Ty, 12, 0x80000000L, Ty, B.getInt32Ty()};
+    if (Ty->isDoubleTy())
+      return FRemExpander{B, Ty, 26, 0x8000000000000000L, Ty, B.getInt64Ty()};
+
+    return std::nullopt;
+  }
+
+  /// Build the FRem expansion for the numerator \p X and the
+  /// denominator \p Y using the builder \p B.  The type of X and Y
+  /// must match the type for which the class instance has been
+  /// created. The code will be generated at the insertion point of \p
+  /// B and the insertion point will be reset at exit.
+  Value *buildFRem(Value *X, Value *Y) const;
+
+private:
+  FRemExpander(IRBuilder<> &B, Type *FremTy, short Bits, unsigned long Signbit,
+               Type *ComputeFpTy, Type *IntTy)
+      : B(B), FremTy(FremTy), ComputeFpTy(ComputeFpTy), IntTy(IntTy),
+        ExTy(B.getInt32Ty()), Bits(ConstantInt::get(ExTy, Bits)),
+        One(ConstantInt::get(ExTy, 1)), Signbit(Signbit) {};
+
+  Value *createLdexp(Value *Base, Value *Exp, const Twine &Name) const {
+    return B.CreateIntrinsic(Intrinsic::ldexp, {ComputeFpTy, B.getInt32Ty()},
+                             {Base, Exp}, {}, Name);
+  }
+
+  Value *createRcp(Value *V, const Twine &Name) const {
+    return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name);
+  }
+
+  // Helper function to build the UPDATE_AX code which is common to the
+  // loop body and the "final iteration".
+  Value *buildUpdateAx(Value *Ax, Value *Ay, Value *Ayinv) const {
+    // Build:
+    //   float q = BUILTIN_RINT_ComputeFpTy(ax * ayinv);
+    //   ax = fnma(q, ay, ax);
+    //   int clt = ax < 0.0f;
+    //   float axp = ax + ay;
+    //   ax = clt ? axp : ax;
+    Value *Q = B.CreateUnaryIntrinsic(Intrinsic::rint, B.CreateFMul(Ax, Ayinv),
+                                      {}, "q");
+    Value *AxUpdate = B.CreateIntrinsic(Intrinsic::fma, {ComputeFpTy},
+                                        {B.CreateFNeg(Q), Ay, Ax}, {}, "ax");
+    Value *Clt = B.CreateFCmp(CmpInst::FCMP_OLT, AxUpdate,
+                              ConstantFP::get(ComputeFpTy, 0.0), "clt");
+    Value *Axp = B.CreateFAdd(AxUpdate, Ay, "axp");
+    AxUpdate = B.CreateSelect(Clt, Axp, AxUpdate, "ax");
+
+    return AxUpdate;
+  }
+
+  /// Build code to extract the exponent and mantissa of \p Src.
+  /// Return, in this order, the mantissa scaled by 2^\p NewExp and
+  /// the exponent minus one for use as a loop bound.
+  std::pair<Value *, Value *> buildExpAndPower(Value *Src, Value *NewExp,
+                                               const Twine &ExName,
+                                               const Twine &PowName) const {
+    // Build:
+    //   ExName = BUILTIN_FREXP_EXP_ComputeFpTy(Src) - 1;
+    //   PowName =
+    //   BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(Src),
+    //   NewExp);
+    Type *Ty = Src->getType();
+    Type *ExTy = B.getInt32Ty();
+    Value *Frexp = B.CreateIntrinsic(Intrinsic::frexp, {Ty, ExTy}, Src);
+    Value *Mant = B.CreateExtractValue(Frexp, {0});
+    Value *Exp = B.CreateExtractValue(Frexp, {1});
+
+    Exp = B.CreateSub(Exp, One, ExName);
+    Value *Pow = createLdexp(Mant, NewExp, PowName);
+
+    return {Pow, Exp};
+  }
+
+  /// Build the main computation of the remainder for the case in which
+  /// Ax > Ay, where Ax = |X|, Ay = |Y|, and X is the numerator and Y the
+  /// denominator. Add the incoming edge from the computation result
+  /// to \p RetPhi.
+  void buildRemainderComputation(Value *AxInitial, Value *AyInitial, Value *X,
+                                 PHINode *RetPhi) const {
+    // Build:
+    // ex = BUILTIN_FREXP_EXP_ComputeFpTy(ax) - 1;
+    // ax = BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ax),
+    //                                 bits);
+    // ey = BUILTIN_FREXP_EXP_ComputeFpTy(ay) - 1;
+    // ay = BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ay), 1);
+    auto [Ax, Ex] = buildExpAndPower(AxInitial, Bits, "ex", "ax");
+    auto [Ay, Ey] = buildExpAndPower(AyInitial, One, "ey", "ay");
+
+    // Build:
+    //   int nb = ex - ey;
+    //   float ayinv = MATH_FAST_RCP(ay);
+    Value *Nb = B.CreateSub(Ex, Ey, "nb");
+    Value *Ayinv = createRcp(Ay, "ayinv");
+
+    // Build: while (nb > bits)
+    BasicBlock *PreheaderBB = B.GetInsertBlock();
+    Function *Fun = PreheaderBB->getParent();
+    auto *LoopBB = BasicBlock::Create(B.getContext(), "frem.loop_body", Fun);
+    auto *ExitBB = BasicBlock::Create(B.getContext(), "frem.loop_exit", Fun);
+
+    B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, Nb, Bits), LoopBB, ExitBB);
+
+    // Build loop body:
+    //   UPDATE_AX
+    //   ax = BUILTIN_FLDEXP_ComputeFpTy(ax, bits);
+    //   nb -= bits;
+    // One iteration of the loop is factored out.  The code shared by
+    // the loop and this "iteration" is denoted by UPDATE_AX.
+    B.SetInsertPoint(LoopBB);
+    auto *NbIv = B.CreatePHI(Nb->getType(), 2, "nb_iv");
+    NbIv->addIncoming(Nb, PreheaderBB);
+
+    auto *AxPhi = B.CreatePHI(ComputeFpTy, 2, "ax_loop_phi");
+    AxPhi->addIncoming(Ax, PreheaderBB);
+
+    Value *AxPhiUpdate = buildUpdateAx(AxPhi, Ay, Ayinv);
+    AxPhiUpdate = createLdexp(AxPhiUpdate, Bits, "ax_update");
+    AxPhi->addIncoming(AxPhiUpdate, LoopBB);
+    NbIv->addIncoming(B.CreateSub(NbIv, Bits, "nb_update"), LoopBB);
+
+    B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, NbIv, Bits), LoopBB, ExitBB);
+
+    // Build final iteration
+    //   ax = BUILTIN_FLDEXP_ComputeFpTy(ax, nb - bits + 1);
+    //   UPDATE_AX
+    B.SetInsertPoint(ExitBB);
+
+    auto *AxPhiExit = B.CreatePHI(ComputeFpTy, 2, "ax_exit_phi");
+    AxPhiExit->addIncoming(Ax, PreheaderBB);
+    AxPhiExit->addIncoming(AxPhi, LoopBB);
+    auto *NbExitPhi = B.CreatePHI(Nb->getType(), 2, "nb_exit_phi");
+    NbExitPhi->addIncoming(NbIv, LoopBB);
+    NbExitPhi->addIncoming(Nb, PreheaderBB);
+
+    Value *AxFinal = createLdexp(
+        AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), "ax");
+    AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv);
+
+    // Adjust exponent and sign
+    //    ax = BUILTIN_FLDEXP_ComputeFpTy(ax, ey);
+    //    ret = AS_FLOAT((AS_INT(x) & SIGNBIT_SP32) ^ AS_INT(ax));
+    AxFinal = createLdexp(AxFinal, Ey, "ax");
+
+    Value *XAsInt = B.CreateBitCast(X, IntTy, "x_as_int");
+    if (ComputeFpTy != X->getType())
+      AxFinal = B.CreateFPTrunc(AxFinal, X->getType());
+
+    Value *AxAsInt = B.CreateBitCast(AxFinal, IntTy, "ax_as_int");
+
+    Value *Ret =
+        B.CreateXor(B.CreateAnd(XAsInt, Signbit), AxAsInt, "Remainder");
+    Ret = B.CreateBitCast(Ret, X->getType());
+
+    RetPhi->addIncoming(Ret, ExitBB);
+  }
+
+  /// Build the else-branch of the conditional in the FRem
+  /// expansion, i.e. the case in which Ax <= Ay, where Ax = |X|, Ay
+  /// = |Y|, and X is the numerator and Y the denominator. Add the
+  /// incoming edge from the result to \p RetPhi.
+  void buildElseBranch(Value *Ax, Value *Ay, Value *X, PHINode *RetPhi) const {
+    // Build:
+    // ret = ax == ay ? BUILTIN_COPYSIGN_ComputeFpTy(0.0f, x) : x;
+    Value *ZeroWithXSign = B.CreateIntrinsic(
+        Intrinsic::copysign, {FremTy}, {ConstantFP::get(FremTy, 0.0), X}, {});
+
+    Value *Ret = B.CreateSelect(B.CreateFCmpOEQ(Ax, Ay), ZeroWithXSign, X);
+
+    RetPhi->addIncoming(Ret, B.GetInsertBlock());
+  }
+
+  /// Adjust the result of the main computation from the FRem expansion
+  /// if NaNs or infinite values are possible.
+  Value *buildNanAndInfHandling(Value *Ret, Value *X, Value *Y) const {
+    // Build:
+    //   ret = y == 0.0f ? QNAN_ComputeFpTy : ret;
+    //   c = !BUILTIN_ISNAN_ComputeFpTy(y) && BUILTIN_ISFINITE_ComputeFpTy(x);
+    //   ret = c ? ret : QNAN_ComputeFpTy;
+    // TODO Handle NaN and infinity fast math flags separately here?
+    Value *Nan = ConstantFP::getQNaN(FremTy);
+
+    Ret = B.CreateSelect(B.createIsFPClass(Y, FPClassTest::fcZero), Nan, Ret);
+    Value *C = B.CreateLogicalAnd(
+        B.CreateNot(B.createIsFPClass(Y, FPClassTest::fcNan)),
+        B.createIsFPClass(X, FPClassTest::fcFinite));
+    Ret = B.CreateSelect(C, Ret, Nan);
+
+    return Ret;
+  }
+};
+
+Value *FRemExpander::buildFRem(Value *X, Value *Y) const {
+  assert(X->getType() == FremTy && Y->getType() == FremTy);
+
+  FastMathFlags FMF = B.getFastMathFlags();
+
+  // This function generates the following code structure:
+  //   if (abs(x) > abs(y))
+  //   { ret = compute remainder }
+  //   else
+  //   { ret = x or 0 with sign of x }
+  //   Adjust ret for NaN/inf in the inputs
+  //   return ret
+  Value *Ax = B.CreateUnaryIntrinsic(Intrinsic::fabs, X, {}, "ax");
+  Value *Ay = B.CreateUnaryIntrinsic(Intrinsic::fabs, Y, {}, "ay");
+  if (ComputeFpTy != X->getType()) {
+    Ax = B.CreateFPExt(Ax, ComputeFpTy, "ax");
+    Ay = B.CreateFPExt(Ay, ComputeFpTy, "ay");
+  }
+  Value *AxAyCmp = B.CreateFCmpOGT(Ax, Ay);
+
+  PHINode *RetPhi = B.CreatePHI(FremTy, 2, "ret");
+  Value *Ret = RetPhi;
+
+  if (!FMF.noNaNs() || !FMF.noInfs())
+    Ret = buildNanAndInfHandling(Ret, X, Y);
+
+  Function *Fun = B.GetInsertBlock()->getParent();
+  auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun);
+  auto *ElseBB = BasicBlock::Create(B.getContext(), "frem.else", Fun);
+  SplitBlockAndInsertIfThenElse(AxAyCmp, RetPhi, &ThenBB, &ElseBB);
+
+  auto SavedInsertPt = B.GetInsertPoint();
+
+  // Build remainder computation for "then" branch
+  //
+  // The ordered comparison ensures that ax and ay are not NaNs
+  // in the then-branch. Furthermore, y cannot be an infinity and the
+  // check at the end of the function ensures that the result will not
+  // be used if x is an infinity.
+  FastMathFlags ComputeFMF = FMF;
+  ComputeFMF.setNoInfs();
+  ComputeFMF.setNoNaNs();
+
+  B.SetInsertPoint(ThenBB);
+  B.setFastMathFlags(ComputeFMF);
+  buildRemainderComputation(Ax, Ay, X, RetPhi);
+  B.setFastMathFlags(FMF);
+  B.CreateBr(RetPhi->getParent());
+
+  // Build "else"-branch
+  B.SetInsertPoint(ElseBB);
+  buildElseBranch(Ax, Ay, X, RetPhi);
+  B.CreateBr(RetPhi->getParent());
+
+  B.SetInsertPoint(SavedInsertPt);
+
+  return Ret;
+}
+} // namespace
+
+/// Return true if \p Op either is a constant or a selection
+/// instruction with constant operands.
+static bool isConstOrConstSelectOp(Value *Op) {
+  if (isa<Constant>(Op))
+    return true;
+
+  auto *S = dyn_cast<SelectInst>(Op);
+  if (!S)
+    return false;
+
+  return isa<Constant>(S->getTrueValue()) && isa<Constant>(S->getFalseValue());
+}
+
+/// Returns true if \p I should not be expanded because
+/// it will be eliminated during ISel.
+static bool shouldSkipExpandFRem(BinaryOperator &I) {
+  // This condition should be sufficient for DAGCombiner::visitFREM to
+  // eliminate the instruction.
+  return isConstOrConstSelectOp(I.getOperand(0)) &&
+         isConstOrConstSelectOp(I.getOperand(1));
+}
+
+static bool expandFRem(BinaryOperator &I) {
+  LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
+  if (shouldSkipExpandFRem(I)) {
+    LLVM_DEBUG(
+        dbgs() << "Skipping 'frem' instruction that should be removed by "
+                  "DAGCombiner.\n");
+    return false;
+  }
+
+  Type *ReturnTy = I.getType();
+  assert(ReturnTy->isFPOrFPVectorTy());
+
+  FastMathFlags FMF = I.getFastMathFlags();
+  // TODO Make use of those flags for optimization?
+  FMF.setAllowReciprocal(false);
+  FMF.setAllowContract(false);
+  FMF.setApproxFunc(false);
+
+  IRBuilder<> B(&I);
+  B.setFastMathFlags(FMF);
+  B.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *ElemTy = ReturnTy->getScalarType();
+  const std::optional<FRemExpander> Expander = FRemExpander::create(B, ElemTy);
+
+  if (!Expander || isa<ScalableVectorType>(ReturnTy)) {
+    LLVM_DEBUG(dbgs() << "Cannot expand 'frem' of type " << ReturnTy << ".\n");
+    return false;
+  }
+
+  Value *Ret;
+  if (ReturnTy->isFloatingPointTy())
+    Ret = Expander->buildFRem(I.getOperand(0), I.getOperand(1));
+  else {
+    auto VecTy = cast<FixedVectorType>(ReturnTy);
+
+    // This could use SplitBlockAndInsertForEachLane but the interface
+    // is a bit awkward for a constant number of elements and it will
+    // boil down to the same code.
+    // TODO Expand the FRem instruction only once and reuse the code.
+    Value *Nums = I.getOperand(0);
+    Value *Denums = I.getOperand(1);
+    Ret = PoisonValue::get(I.getType());
+    for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
+      Value *Num = B.CreateExtractElement(Nums, I);
+      Value *Denum = B.CreateExtractElement(Denums, I);
+      Value *Rem = Expander->buildFRem(Num, Denum);
+      Ret = B.CreateInsertElement(Ret, Rem, I);
+    }
+  }
+
+  I.replaceAllUsesWith(Ret);
+  Ret->takeName(&I);
+  I.removeFromParent();
+  I.dropAllReferences();
+
+  return true;
+}
+
 /// Generate code to convert a fp number to integer, replacing FPToS(U)I with
 /// the generated code. This currently generates code similarly to compiler-rt's
 /// implementations.
@@ -604,6 +979,12 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
 
   for (auto &I : instructions(F)) {
     switch (I.getOpcode()) {
+    case Instruction::FRem:
+      if (TLI.shouldExpandFRemInIR()) {
+        Replace.push_back(&I);
+        Modified = true;
+      }
+      break;
     case Instruction::FPToUI:
     case Instruction::FPToSI: {
       // TODO: This pass doesn't handle scalable vectors.
@@ -654,8 +1035,10 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
 
   while (!Replace.empty()) {
     Instruction *I = Replace.pop_back_val();
-    if (I->getOpcode() == Instruction::FPToUI ||
-        I->getOpcode() == Instruction::FPToSI) {
+    if (I->getOpcode() == Instruction::FRem)
+      expandFRem(llvm::cast<BinaryOperator>(*I));
+    else if (I->getOpcode() == Instruction::FPToUI ||
+             I->getOpcode() == Instruction::FPToSI) {
       expandFPToI(I);
     } else {
       expandIToFP(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index c74dc7942f52c..b2b136c984bf4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,6 +387,8 @@ class AMDGPUTargetLowering : public TargetLowering {
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+  bool shouldExpandFRemInIR() const override { return true; };
+
 };
 
 namespace AMDGPUISD {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index e4e6c44b051c3..e40d9690d832b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -7,59 +7,206 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    s_load_dword s3, s[4:5], 0x2
+; CI-NEXT:    s_mov_b32 s4, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
-; CI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
-; CI-NEXT:    v_rcp_f32_e32 v4, v2
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s3|
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT:    s_cbranch_vccz .LBB0_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:  .LBB0_2: ; %Flow18
+; CI-NEXT:    s_xor_b32 s4, s4, 1
+; CI-NEXT:    s_and_b32 s4, s4, 1
+; CI-NEXT:    s_cmp_lg_u32 s4, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB0_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT:    v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v8, v3
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
-; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; CI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; CI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; CI-NEXT:    v_fma_f32 v3, -v3, v9, v7
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; CI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB0_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT:  .LBB0_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT:    s_cbranch_vccnz .LBB0_5
+; CI-NEXT:    s_branch .LBB0_7
+; CI-NEXT:  .LBB0_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB0_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; CI-NEXT:    v_rndne_f32_e32 v3, v3
+; CI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:  .LBB0_8: ; %Flow19
+; CI-NEXT:    s_and_b32 s3, s3, 0x7fff
+; CI-NEXT:    s_and_b32 s3, 0xffff, s3
+; CI-NEXT:    s_cmp_eq_u32 s3, 0
+; CI-NEXT:    s_cselect_b32 s4, 1, 0
+; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
+; CI-NEXT:    s_and_b32 s2, 0xffff, s2
+; CI-NEXT:    s_cmpk_lt_u32 s2, 0x7c00
+; CI-NEXT:    s_cselect_b32 s2, 1, 0
+; CI-NEXT:    s_cmpk_le_u32 s3, 0x7c00
+; CI-NEXT:    s_cselect_b32 s3, 1, 0
+; CI-NEXT:    s_and_b32 s2, s3, s2
+; CI-NEXT:    s_and_b32 s3, 1, s4
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; CI-NEXT:    s_and_b32 s2, 1, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT:    s_mov_b32 s1, 1
+; VI-NEXT:    ; implicit-def: $vgpr1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
+; VI-NEXT:    s_load_dword s0, s[10:11], 0x0
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_rcp_f32_e32 v3, v2
-; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
-; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
-; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
-; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
-; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
-; VI-NEXT:    v_add_f32_e32 v0, v0, v4
-; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
-; VI-NEXT:    v_trunc_f16_e32 v0, v0
-; VI-NEXT:    v_fma_f16 v2, -v0, v1, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s0|
+; VI-NEXT:    v_cvt_f32_f16_e64 v0, |s2|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v0
+; VI-NEXT:    s_cbranch_vccz .LBB0_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    s_mov_b32 s1, 0
+; VI-NEXT:  .LBB0_2: ; %Flow18
+; VI-NEXT:    s_xor_b32 s1, s1, 1
+; VI-NEXT:    s_and_b32 s1, s1, 1
+; VI-NEXT:    s_cmp_lg_u32 s1, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB0_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e32 v1, v2
+; VI-NEXT:    v_ldexp_f32 v4, v1, 11
+; VI-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; VI-NEXT:    v_ldexp_f32 v1, v1, 1
+; VI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v8, v3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; VI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; VI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; VI-NEXT:    v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB0_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT:  .LBB0_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT:    v_ldexp_f32 v4, v4, 11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT:    s_cbranch_vccnz .LBB0_5
+; VI-NEXT:    s_branch .LBB0_7
+; VI-NEXT:  .LBB0_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB0_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; VI-NEXT:    v_rndne_f32_e32 v3, v3
+; VI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    v_xor_b32_e32 v1, s1, v0
+; VI-NEXT:  .LBB0_8: ; %Flow19
+; VI-NEXT:    v_mov_b32_e32 v0, 0x60
+; VI-NEXT:    v_cmp_class_f16_e32 vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s2, 3
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s0, v0
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -75,35 +222,176 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    ; implicit-def: $vgpr1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    s_load_dword s3, s[4:5], 0x2
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s3|
+; CI-NEXT:    s_mov_b32 s3, 1
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v0
+; CI-NEXT:    s_cbranch_vccz .LBB1_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s3, s2, 0xffff8000
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v0
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT:    s_mov_b32 s3, 0
+; CI-NEXT:  .LBB1_2: ; %Flow18
+; CI-NEXT:    s_xor_b32 s3, s3, 1
+; CI-NEXT:    s_and_b32 s3, s3, 1
+; CI-NEXT:    s_cmp_lg_u32 s3, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB1_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e32 v1, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v1, 11
+; CI-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; CI-NEXT:    v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v8, v3
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; CI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; CI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; CI-NEXT:    v_fma_f32 v3, -v3, v9, v7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB1_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT:  .LBB1_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT:    s_cbranch_vccnz .LBB1_5
+; CI-NEXT:    s_branch .LBB1_7
+; CI-NEXT:  .LBB1_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB1_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; CI-NEXT:    v_rndne_f32_e32 v3, v3
+; CI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    s_and_b32 s2, s2, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v1, s2, v0
+; CI-NEXT:  .LBB1_8: ; %Flow19
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    v_rcp_f32_e32 v2, v1
-; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    buffer_store_short v1, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fast_frem_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    ; implicit-def: $vgpr2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_rcp_f16_e32 v0, s3
-; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
-; VI-NEXT:    v_trunc_f16_e32 v0, v0
-; VI-NEXT:    v_fma_f16 v2, -v0, s3, v1
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, |s2|
+; VI-NEXT:    v_cvt_f32_f16_e64 v0, |s3|
+; VI-NEXT:    s_mov_b32 s3, 1
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v1, v0
+; VI-NEXT:    s_cbranch_vccz .LBB1_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s3, s2, 0xffff8000
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v1, v0
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_mov_b32_e32 v3, s2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT:    s_mov_b32 s3, 0
+; VI-NEXT:  .LBB1_2: ; %Flow18
+; VI-NEXT:    s_xor_b32 s3, s3, 1
+; VI-NEXT:    s_and_b32 s3, s3, 1
+; VI-NEXT:    s_cmp_lg_u32 s3, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB1_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v1
+; VI-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_ldexp_f32 v1, v1, 1
+; VI-NEXT:    v_ldexp_f32 v4, v2, 11
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v3, v0
+; VI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v8, v3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; VI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; VI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; VI-NEXT:    v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB1_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT:  .LBB1_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT:    v_ldexp_f32 v4, v4, 11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT:    s_cbranch_vccnz .LBB1_5
+; VI-NEXT:    s_branch .LBB1_7
+; VI-NEXT:  .LBB1_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB1_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; VI-NEXT:    v_rndne_f32_e32 v3, v3
+; VI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff8000
+; VI-NEXT:    v_xor_b32_e32 v2, s2, v0
+; VI-NEXT:  .LBB1_8: ; %Flow19
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
@@ -121,37 +409,182 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    s_load_dword s3, s[4:5], 0x2
+; CI-NEXT:    s_mov_b32 s4, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s3|
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT:    s_cbranch_vccz .LBB2_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:  .LBB2_2: ; %Flow18
+; CI-NEXT:    s_xor_b32 s4, s4, 1
+; CI-NEXT:    s_and_b32 s4, s4, 1
+; CI-NEXT:    s_cmp_lg_u32 s4, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB2_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v2, v0
 ; CI-NEXT:    v_rcp_f32_e32 v2, v1
-; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v3
+; CI-NEXT:    s_cbranch_vccnz .LBB2_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 11, v5
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
+; CI-NEXT:  .LBB2_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v2
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v3
+; CI-NEXT:    s_cbranch_vccnz .LBB2_5
+; CI-NEXT:    s_branch .LBB2_7
+; CI-NEXT:  .LBB2_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB2_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -10, v3
+; CI-NEXT:    v_ldexp_f32_e32 v3, v5, v3
+; CI-NEXT:    v_mul_f32_e32 v2, v3, v2
+; CI-NEXT:    v_rndne_f32_e32 v2, v2
+; CI-NEXT:    v_fma_f32 v2, -v2, v1, v3
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:  .LBB2_8: ; %Flow19
+; CI-NEXT:    s_and_b32 s3, s3, 0x7fff
+; CI-NEXT:    s_and_b32 s3, 0xffff, s3
+; CI-NEXT:    s_cmp_eq_u32 s3, 0
+; CI-NEXT:    s_cselect_b32 s4, 1, 0
+; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
+; CI-NEXT:    s_and_b32 s2, 0xffff, s2
+; CI-NEXT:    s_cmpk_lt_u32 s2, 0x7c00
+; CI-NEXT:    s_cselect_b32 s2, 1, 0
+; CI-NEXT:    s_cmpk_le_u32 s3, 0x7c00
+; CI-NEXT:    s_cselect_b32 s3, 1, 0
+; CI-NEXT:    s_and_b32 s2, s3, s2
+; CI-NEXT:    s_and_b32 s3, 1, s4
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; CI-NEXT:    s_and_b32 s2, 1, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: unsafe_frem_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT:    s_mov_b32 s1, 1
+; VI-NEXT:    ; implicit-def: $vgpr1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
+; VI-NEXT:    s_load_dword s0, s[10:11], 0x0
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_rcp_f16_e32 v0, s3
-; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
-; VI-NEXT:    v_trunc_f16_e32 v0, v0
-; VI-NEXT:    v_fma_f16 v2, -v0, s3, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s0|
+; VI-NEXT:    v_cvt_f32_f16_e64 v0, |s2|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v0
+; VI-NEXT:    s_cbranch_vccz .LBB2_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    s_mov_b32 s1, 0
+; VI-NEXT:  .LBB2_2: ; %Flow18
+; VI-NEXT:    s_xor_b32 s1, s1, 1
+; VI-NEXT:    s_and_b32 s1, s1, 1
+; VI-NEXT:    s_cmp_lg_u32 s1, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB2_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e32 v1, v2
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT:    v_ldexp_f32 v4, v1, 11
+; VI-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_ldexp_f32 v1, v1, 1
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v2, v0
+; VI-NEXT:    v_rcp_f32_e32 v2, v1
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v3
+; VI-NEXT:    s_cbranch_vccnz .LBB2_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 11, v5
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
+; VI-NEXT:  .LBB2_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v2
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT:    v_ldexp_f32 v4, v4, 11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v3
+; VI-NEXT:    s_cbranch_vccnz .LBB2_5
+; VI-NEXT:    s_branch .LBB2_7
+; VI-NEXT:  .LBB2_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB2_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -10, v3
+; VI-NEXT:    v_ldexp_f32 v3, v5, v3
+; VI-NEXT:    v_mul_f32_e32 v2, v3, v2
+; VI-NEXT:    v_rndne_f32_e32 v2, v2
+; VI-NEXT:    v_fma_f32 v2, -v2, v1, v3
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    v_xor_b32_e32 v1, s1, v0
+; VI-NEXT:  .LBB2_8: ; %Flow19
+; VI-NEXT:    v_mov_b32_e32 v0, 0x60
+; VI-NEXT:    v_cmp_class_f16_e32 vcc, s2, v0
+; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s2, 3
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s0, v0
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -168,27 +601,96 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s6, s[2:3], 0x0
-; CI-NEXT:    s_load_dword s2, s[4:5], 0x4
+; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
+; CI-NEXT:    s_mov_b32 s4, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s3
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT:    ; implicit-def: $vgpr0
+; CI-NEXT:    s_cbranch_vccz .LBB3_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s6
-; CI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
-; CI-NEXT:    v_rcp_f32_e32 v3, v1
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s4
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:  .LBB3_2: ; %Flow16
+; CI-NEXT:    s_xor_b32 s4, s4, 1
+; CI-NEXT:    s_and_b32 s4, s4, 1
+; CI-NEXT:    s_cmp_lg_u32 s4, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB3_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT:    v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v8, v3
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; CI-NEXT:    v_fma_f32 v3, v4, v3, v3
-; CI-NEXT:    v_mul_f32_e32 v4, v2, v3
-; CI-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; CI-NEXT:    v_fma_f32 v4, v5, v3, v4
-; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; CI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; CI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; CI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; CI-NEXT:    v_fma_f32 v3, -v3, v9, v7
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; CI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB3_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT:  .LBB3_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT:    s_cbranch_vccnz .LBB3_5
+; CI-NEXT:    s_branch .LBB3_7
+; CI-NEXT:  .LBB3_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB3_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; CI-NEXT:    v_rndne_f32_e32 v3, v3
+; CI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:  .LBB3_8: ; %Flow17
+; CI-NEXT:    v_mov_b32_e32 v1, 0x60
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v1
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v2
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s3, 3
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s6
-; CI-NEXT:    v_trunc_f32_e32 v1, v1
-; CI-NEXT:    v_fma_f32 v0, -v1, v0, s6
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -197,25 +699,94 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s6, s[2:3], 0x0
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x10
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
+; VI-NEXT:    s_mov_b32 s4, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT:    ; implicit-def: $vgpr0
+; VI-NEXT:    s_cbranch_vccz .LBB3_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s6
-; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
-; VI-NEXT:    v_rcp_f32_e32 v3, v1
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:  .LBB3_2: ; %Flow16
+; VI-NEXT:    s_xor_b32 s4, s4, 1
+; VI-NEXT:    s_and_b32 s4, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB3_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT:    v_ldexp_f32 v1, v1, 1
+; VI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT:    v_ldexp_f32 v4, v0, 12
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v8, v3
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
-; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
-; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
-; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; VI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; VI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; VI-NEXT:    v_fma_f32 v3, -v3, v9, v7
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s6
-; VI-NEXT:    v_trunc_f32_e32 v1, v1
-; VI-NEXT:    v_fma_f32 v2, -v1, v0, s6
+; VI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB3_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT:  .LBB3_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT:    v_ldexp_f32 v4, v4, 12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT:    s_cbranch_vccnz .LBB3_5
+; VI-NEXT:    s_branch .LBB3_7
+; VI-NEXT:  .LBB3_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB3_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; VI-NEXT:    v_rndne_f32_e32 v3, v3
+; VI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; VI-NEXT:  .LBB3_8: ; %Flow17
+; VI-NEXT:    v_mov_b32_e32 v1, 0x60
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v1
+; VI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v2
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s3, 3
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -236,12 +807,82 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
+; CI-NEXT:    s_mov_b32 s4, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v1, s2
-; CI-NEXT:    v_rcp_f32_e32 v0, s3
-; CI-NEXT:    v_mul_f32_e32 v0, s2, v0
-; CI-NEXT:    v_trunc_f32_e32 v0, v0
-; CI-NEXT:    v_fma_f32 v0, -v0, s3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, s3
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT:    ; implicit-def: $vgpr0
+; CI-NEXT:    s_cbranch_vccz .LBB4_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s4
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:  .LBB4_2: ; %Flow16
+; CI-NEXT:    s_xor_b32 s4, s4, 1
+; CI-NEXT:    s_and_b32 s4, s4, 1
+; CI-NEXT:    s_cmp_lg_u32 s4, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB4_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT:    v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v8, v3
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; CI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; CI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; CI-NEXT:    v_fma_f32 v3, -v3, v9, v7
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB4_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT:  .LBB4_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT:    s_cbranch_vccnz .LBB4_5
+; CI-NEXT:    s_branch .LBB4_7
+; CI-NEXT:  .LBB4_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB4_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; CI-NEXT:    v_rndne_f32_e32 v3, v3
+; CI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT:    s_and_b32 s2, s2, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; CI-NEXT:  .LBB4_8: ; %Flow17
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -254,15 +895,85 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
+; VI-NEXT:    s_mov_b32 s4, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_rcp_f32_e32 v0, s3
-; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
-; VI-NEXT:    v_trunc_f32_e32 v0, v0
-; VI-NEXT:    v_fma_f32 v2, -v0, s3, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT:    ; implicit-def: $vgpr0
+; VI-NEXT:    s_cbranch_vccz .LBB4_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:  .LBB4_2: ; %Flow16
+; VI-NEXT:    s_xor_b32 s4, s4, 1
+; VI-NEXT:    s_and_b32 s4, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB4_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT:    v_ldexp_f32 v1, v1, 1
+; VI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT:    v_ldexp_f32 v4, v0, 12
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v8, v3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; VI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; VI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; VI-NEXT:    v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB4_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT:  .LBB4_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT:    v_ldexp_f32 v4, v4, 12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT:    s_cbranch_vccnz .LBB4_5
+; VI-NEXT:    s_branch .LBB4_7
+; VI-NEXT:  .LBB4_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB4_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; VI-NEXT:    v_rndne_f32_e32 v3, v3
+; VI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_and_b32 s2, s2, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; VI-NEXT:  .LBB4_8: ; %Flow17
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    flat_store_dword v[1:2], v0
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -280,12 +991,80 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
+; CI-NEXT:    s_mov_b32 s4, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v1, s2
-; CI-NEXT:    v_rcp_f32_e32 v0, s3
-; CI-NEXT:    v_mul_f32_e32 v0, s2, v0
-; CI-NEXT:    v_trunc_f32_e32 v0, v0
-; CI-NEXT:    v_fma_f32 v0, -v0, s3, v1
+; CI-NEXT:    v_mov_b32_e32 v0, s3
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT:    ; implicit-def: $vgpr0
+; CI-NEXT:    s_cbranch_vccz .LBB5_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s4
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:  .LBB5_2: ; %Flow16
+; CI-NEXT:    s_xor_b32 s4, s4, 1
+; CI-NEXT:    s_and_b32 s4, s4, 1
+; CI-NEXT:    s_cmp_lg_u32 s4, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB5_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v2, v0
+; CI-NEXT:    v_rcp_f32_e32 v2, v1
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v3
+; CI-NEXT:    s_cbranch_vccnz .LBB5_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 12, v5
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
+; CI-NEXT:  .LBB5_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v2
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -12, v3
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v3
+; CI-NEXT:    s_cbranch_vccnz .LBB5_5
+; CI-NEXT:    s_branch .LBB5_7
+; CI-NEXT:  .LBB5_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB5_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT:    v_ldexp_f32_e32 v3, v5, v3
+; CI-NEXT:    v_mul_f32_e32 v2, v3, v2
+; CI-NEXT:    v_rndne_f32_e32 v2, v2
+; CI-NEXT:    v_fma_f32 v2, -v2, v1, v3
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:  .LBB5_8: ; %Flow17
+; CI-NEXT:    v_mov_b32_e32 v1, 0x60
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v1
+; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v2
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s3, 3
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -298,12 +1077,80 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
+; VI-NEXT:    s_mov_b32 s4, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_rcp_f32_e32 v0, s3
-; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
-; VI-NEXT:    v_trunc_f32_e32 v0, v0
-; VI-NEXT:    v_fma_f32 v2, -v0, s3, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT:    ; implicit-def: $vgpr0
+; VI-NEXT:    s_cbranch_vccz .LBB5_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:  .LBB5_2: ; %Flow16
+; VI-NEXT:    s_xor_b32 s4, s4, 1
+; VI-NEXT:    s_and_b32 s4, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB5_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT:    v_ldexp_f32 v4, v0, 12
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_ldexp_f32 v1, v1, 1
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v2, v0
+; VI-NEXT:    v_rcp_f32_e32 v2, v1
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v3
+; VI-NEXT:    s_cbranch_vccnz .LBB5_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 12, v5
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
+; VI-NEXT:  .LBB5_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v2
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -12, v3
+; VI-NEXT:    v_ldexp_f32 v4, v4, 12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v3
+; VI-NEXT:    s_cbranch_vccnz .LBB5_5
+; VI-NEXT:    s_branch .LBB5_7
+; VI-NEXT:  .LBB5_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB5_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT:    v_ldexp_f32 v3, v5, v3
+; VI-NEXT:    v_mul_f32_e32 v2, v3, v2
+; VI-NEXT:    v_rndne_f32_e32 v2, v2
+; VI-NEXT:    v_fma_f32 v2, -v2, v1, v3
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_and_b32 s4, s2, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; VI-NEXT:  .LBB5_8: ; %Flow17
+; VI-NEXT:    v_mov_b32_e32 v1, 0x60
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v1
+; VI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v2
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s3, 3
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -321,25 +1168,109 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s6, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
-; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
-; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
-; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; CI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
-; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
-; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
+; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT:    s_cbranch_vccz .LBB6_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_brev_b32 s7, 1
+; CI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT:    v_mov_b32_e32 v0, s6
+; CI-NEXT:    v_mov_b32_e32 v1, s7
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:  .LBB6_2: ; %Flow16
+; CI-NEXT:    s_xor_b32 s6, s6, 1
+; CI-NEXT:    s_and_b32 s6, s6, 1
+; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB6_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT:    v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT:    v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0
+; CI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
+; CI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT:    v_mul_f64 v[12:13], v[14:15], v[10:11]
+; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; CI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB6_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT:    v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT:  .LBB6_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT:    v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT:    v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT:    s_cbranch_vccnz .LBB6_5
+; CI-NEXT:    s_branch .LBB6_7
+; CI-NEXT:  .LBB6_6:
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:  .LBB6_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_brev_b32 s7, 1
+; CI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; CI-NEXT:    v_xor_b32_e32 v1, s7, v1
+; CI-NEXT:  .LBB6_8: ; %Flow17
+; CI-NEXT:    v_mov_b32_e32 v2, 0x60
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v2
+; CI-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
+; CI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; CI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v3
+; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[4:5], 3
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -349,26 +1280,110 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_mov_b32 s6, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
-; VI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
-; VI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
-; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
-; VI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
+; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT:    s_cbranch_vccz .LBB6_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:    s_brev_b32 s7, 1
+; VI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:  .LBB6_2: ; %Flow16
+; VI-NEXT:    s_xor_b32 s6, s6, 1
+; VI-NEXT:    s_and_b32 s6, s6, 1
+; VI-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB6_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT:    v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0
+; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT:    v_mul_f64 v[12:13], v[14:15], v[10:11]
+; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB6_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT:  .LBB6_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT:    v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT:    v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT:    s_cbranch_vccnz .LBB6_5
+; VI-NEXT:    s_branch .LBB6_7
+; VI-NEXT:  .LBB6_6:
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:  .LBB6_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:    s_brev_b32 s7, 1
+; VI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; VI-NEXT:    v_xor_b32_e32 v1, s7, v1
+; VI-NEXT:  .LBB6_8: ; %Flow17
+; VI-NEXT:    v_mov_b32_e32 v2, 0x60
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
+; VI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v3
+; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[4:5], 3
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -384,24 +1399,99 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s6, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
-; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT:    s_cbranch_vccz .LBB7_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_brev_b32 s7, 1
+; CI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT:    v_mov_b32_e32 v0, s6
+; CI-NEXT:    v_mov_b32_e32 v1, s7
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:  .LBB7_2: ; %Flow16
+; CI-NEXT:    s_xor_b32 s6, s6, 1
+; CI-NEXT:    s_and_b32 s6, s6, 1
+; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB7_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT:    v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT:    v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; CI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
+; CI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT:    v_mul_f64 v[12:13], v[14:15], v[10:11]
+; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; CI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB7_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT:    v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT:  .LBB7_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT:    v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT:    v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT:    s_cbranch_vccnz .LBB7_5
+; CI-NEXT:    s_branch .LBB7_7
+; CI-NEXT:  .LBB7_6:
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:  .LBB7_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:    s_brev_b32 s5, 1
+; CI-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; CI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; CI-NEXT:    v_xor_b32_e32 v1, s3, v1
+; CI-NEXT:  .LBB7_8: ; %Flow17
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -409,22 +1499,97 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_mov_b32 s6, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
-; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT:    s_cbranch_vccz .LBB7_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:    s_brev_b32 s7, 1
+; VI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; VI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:  .LBB7_2: ; %Flow16
+; VI-NEXT:    s_xor_b32 s6, s6, 1
+; VI-NEXT:    s_and_b32 s6, s6, 1
+; VI-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB7_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT:    v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT:    v_mul_f64 v[12:13], v[14:15], v[10:11]
+; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB7_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT:  .LBB7_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT:    v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT:    v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT:    s_cbranch_vccnz .LBB7_5
+; VI-NEXT:    s_branch .LBB7_7
+; VI-NEXT:  .LBB7_6:
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:  .LBB7_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_brev_b32 s5, 1
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; VI-NEXT:    v_xor_b32_e32 v1, s3, v1
+; VI-NEXT:  .LBB7_8: ; %Flow17
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -441,24 +1606,107 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s6, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
-; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT:    s_cbranch_vccz .LBB8_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_brev_b32 s7, 1
+; CI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT:    v_mov_b32_e32 v0, s6
+; CI-NEXT:    v_mov_b32_e32 v1, s7
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:  .LBB8_2: ; %Flow16
+; CI-NEXT:    s_xor_b32 s6, s6, 1
+; CI-NEXT:    s_and_b32 s6, s6, 1
+; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB8_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT:    v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT:    v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; CI-NEXT:    v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; CI-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; CI-NEXT:    v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; CI-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; CI-NEXT:    v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; CI-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; CI-NEXT:    s_cbranch_vccnz .LBB8_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT:    v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT:  .LBB8_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT:    v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT:    v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT:    s_cbranch_vccnz .LBB8_5
+; CI-NEXT:    s_branch .LBB8_7
+; CI-NEXT:  .LBB8_6:
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:  .LBB8_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_brev_b32 s7, 1
+; CI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; CI-NEXT:    v_xor_b32_e32 v1, s7, v1
+; CI-NEXT:  .LBB8_8: ; %Flow17
+; CI-NEXT:    v_mov_b32_e32 v2, 0x60
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v2
+; CI-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
+; CI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; CI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v3
+; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[4:5], 3
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -466,23 +1714,106 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_mov_b32 s6, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
-; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT:    s_cbranch_vccz .LBB8_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:    s_brev_b32 s7, 1
+; VI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; VI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:  .LBB8_2: ; %Flow16
+; VI-NEXT:    s_xor_b32 s6, s6, 1
+; VI-NEXT:    s_and_b32 s6, s6, 1
+; VI-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB8_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT:    v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-NEXT:    v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; VI-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; VI-NEXT:    v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; VI-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; VI-NEXT:    v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; VI-NEXT:    v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; VI-NEXT:    s_cbranch_vccnz .LBB8_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT:  .LBB8_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT:    v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT:    v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT:    s_cbranch_vccnz .LBB8_5
+; VI-NEXT:    s_branch .LBB8_7
+; VI-NEXT:  .LBB8_6:
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:  .LBB8_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:    s_brev_b32 s7, 1
+; VI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; VI-NEXT:    v_xor_b32_e32 v1, s7, v1
+; VI-NEXT:  .LBB8_8: ; %Flow17
+; VI-NEXT:    v_mov_b32_e32 v2, 0x60
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
+; VI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v3
+; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[4:5], 3
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -499,100 +1830,393 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
+; CI-NEXT:    s_mov_b32 s4, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
-; CI-NEXT:    s_lshr_b32 s4, s2, 16
-; CI-NEXT:    s_lshr_b32 s5, s3, 16
-; CI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
-; CI-NEXT:    v_rcp_f32_e32 v4, v2
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s3|
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT:    s_cbranch_vccz .LBB9_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:  .LBB9_2: ; %Flow60
+; CI-NEXT:    s_xor_b32 s4, s4, 1
+; CI-NEXT:    s_and_b32 s4, s4, 1
+; CI-NEXT:    s_cmp_lg_u32 s4, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB9_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT:    v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v8, v3
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
-; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; CI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; CI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; CI-NEXT:    v_fma_f32 v3, -v3, v9, v7
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, s5
-; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; CI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB9_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT:  .LBB9_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT:    s_cbranch_vccnz .LBB9_5
+; CI-NEXT:    s_branch .LBB9_7
+; CI-NEXT:  .LBB9_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB9_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; CI-NEXT:    v_rndne_f32_e32 v3, v3
+; CI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, v1
-; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v2, v1
-; CI-NEXT:    v_rcp_f32_e32 v5, v3
+; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:  .LBB9_8: ; %Flow61
+; CI-NEXT:    s_lshr_b32 s4, s2, 16
+; CI-NEXT:    s_lshr_b32 s5, s3, 16
+; CI-NEXT:    v_cvt_f32_f16_e64 v3, |s4|
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s5|
+; CI-NEXT:    s_mov_b32 s6, 1
+; CI-NEXT:    ; implicit-def: $vgpr1
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v3, v2
+; CI-NEXT:    s_cbranch_vccz .LBB9_10
+; CI-NEXT:  ; %bb.9: ; %frem.else20
+; CI-NEXT:    s_and_b32 s6, s4, 0xffff8000
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s6
+; CI-NEXT:    v_mov_b32_e32 v4, s4
+; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:  .LBB9_10: ; %Flow56
+; CI-NEXT:    s_xor_b32 s6, s6, 1
+; CI-NEXT:    s_and_b32 s6, s6, 1
+; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB9_16
+; CI-NEXT:  ; %bb.11: ; %frem.compute19
+; CI-NEXT:    v_frexp_mant_f32_e32 v4, v2
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v2
+; CI-NEXT:    v_ldexp_f32_e64 v2, v4, 1
+; CI-NEXT:    v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e32 v1, v3
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT:    v_ldexp_f32_e64 v5, v1, 11
+; CI-NEXT:    v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v9, v4
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
-; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
-; CI-NEXT:    v_mul_f32_e32 v6, v4, v5
-; CI-NEXT:    v_fma_f32 v7, -v3, v6, v4
-; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
-; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
+; CI-NEXT:    v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; CI-NEXT:    v_mul_f32_e32 v10, v8, v9
+; CI-NEXT:    v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; CI-NEXT:    v_fma_f32 v4, -v4, v10, v8
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
-; CI-NEXT:    v_trunc_f32_e32 v3, v3
-; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
+; CI-NEXT:    v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v3
+; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB9_14
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 11, v6
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT:  .LBB9_13: ; %frem.loop_body27
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v6, v5
+; CI-NEXT:    v_mul_f32_e32 v5, v6, v4
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT:    v_add_f32_e32 v7, v5, v2
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT:    v_ldexp_f32_e64 v5, v5, 11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v3
+; CI-NEXT:    s_cbranch_vccnz .LBB9_13
+; CI-NEXT:    s_branch .LBB9_15
+; CI-NEXT:  .LBB9_14:
+; CI-NEXT:    v_mov_b32_e32 v6, v5
+; CI-NEXT:  .LBB9_15: ; %frem.loop_exit28
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -10, v3
+; CI-NEXT:    v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT:    v_add_f32_e32 v2, v3, v2
+; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    s_and_b32 s6, s4, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v1, s6, v1
+; CI-NEXT:  .LBB9_16: ; %Flow57
+; CI-NEXT:    s_and_b32 s3, s3, 0x7fff
+; CI-NEXT:    s_and_b32 s3, 0xffff, s3
+; CI-NEXT:    s_cmp_eq_u32 s3, 0
+; CI-NEXT:    s_cselect_b32 s6, 1, 0
+; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
+; CI-NEXT:    s_and_b32 s2, 0xffff, s2
+; CI-NEXT:    s_cmpk_lt_u32 s2, 0x7c00
+; CI-NEXT:    s_cselect_b32 s2, 1, 0
+; CI-NEXT:    s_cmpk_le_u32 s3, 0x7c00
+; CI-NEXT:    s_cselect_b32 s3, 1, 0
+; CI-NEXT:    s_and_b32 s2, s3, s2
+; CI-NEXT:    s_and_b32 s3, s5, 0x7fff
+; CI-NEXT:    s_and_b32 s3, 0xffff, s3
+; CI-NEXT:    s_cmp_eq_u32 s3, 0
+; CI-NEXT:    s_cselect_b32 s5, 1, 0
+; CI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; CI-NEXT:    s_and_b32 s4, 0xffff, s4
+; CI-NEXT:    s_cmpk_lt_u32 s4, 0x7c00
+; CI-NEXT:    s_cselect_b32 s4, 1, 0
+; CI-NEXT:    s_cmpk_le_u32 s3, 0x7c00
+; CI-NEXT:    s_cselect_b32 s3, 1, 0
+; CI-NEXT:    s_and_b32 s3, s3, s4
+; CI-NEXT:    s_and_b32 s4, 1, s6
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
+; CI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; CI-NEXT:    s_and_b32 s2, 1, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    s_and_b32 s2, 1, s5
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    s_and_b32 s2, 1, s3
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT:    s_mov_b32 s1, 1
+; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
+; VI-NEXT:    s_load_dword s0, s[10:11], 0x0
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    v_rcp_f32_e32 v3, v2
-; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
-; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
-; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
-; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
-; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
-; VI-NEXT:    v_add_f32_e32 v0, v0, v4
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s0|
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, |s2|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
+; VI-NEXT:    s_cbranch_vccz .LBB9_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; VI-NEXT:    s_mov_b32 s1, 0
+; VI-NEXT:  .LBB9_2: ; %Flow60
+; VI-NEXT:    s_xor_b32 s1, s1, 1
+; VI-NEXT:    s_and_b32 s1, s1, 1
+; VI-NEXT:    s_cmp_lg_u32 s1, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB9_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
+; VI-NEXT:    v_ldexp_f32 v1, v3, 1
+; VI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e32 v0, v2
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT:    v_ldexp_f32 v4, v0, 11
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v8, v3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; VI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; VI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; VI-NEXT:    v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB9_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT:  .LBB9_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT:    v_ldexp_f32 v4, v4, 11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT:    s_cbranch_vccnz .LBB9_5
+; VI-NEXT:    s_branch .LBB9_7
+; VI-NEXT:  .LBB9_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB9_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; VI-NEXT:    v_rndne_f32_e32 v3, v3
+; VI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, s5
-; VI-NEXT:    v_mov_b32_e32 v2, s5
-; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
-; VI-NEXT:    v_trunc_f16_e32 v0, v0
-; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
-; VI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; VI-NEXT:    v_rcp_f32_e32 v4, v3
-; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
-; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
-; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
-; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
-; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
-; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; VI-NEXT:    v_add_f32_e32 v1, v1, v5
+; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    v_xor_b32_e32 v0, s1, v0
+; VI-NEXT:  .LBB9_8: ; %Flow61
+; VI-NEXT:    s_lshr_b32 s4, s0, 16
+; VI-NEXT:    s_lshr_b32 s6, s2, 16
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |s4|
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s6|
+; VI-NEXT:    s_mov_b32 s1, 1
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v3, v2
+; VI-NEXT:    s_cbranch_vccz .LBB9_10
+; VI-NEXT:  ; %bb.9: ; %frem.else20
+; VI-NEXT:    s_and_b32 s1, s4, 0xffff8000
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-NEXT:    s_mov_b32 s1, 0
+; VI-NEXT:  .LBB9_10: ; %Flow56
+; VI-NEXT:    s_xor_b32 s1, s1, 1
+; VI-NEXT:    s_and_b32 s1, s1, 1
+; VI-NEXT:    s_cmp_lg_u32 s1, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB9_16
+; VI-NEXT:  ; %bb.11: ; %frem.compute19
+; VI-NEXT:    v_frexp_mant_f32_e32 v4, v2
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v2
+; VI-NEXT:    v_ldexp_f32 v2, v4, 1
+; VI-NEXT:    v_div_scale_f32 v4, s[10:11], v2, v2, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e32 v1, v3
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT:    v_ldexp_f32 v5, v1, 11
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v9, v4
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; VI-NEXT:    v_mul_f32_e32 v10, v8, v9
+; VI-NEXT:    v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; VI-NEXT:    v_fma_f32 v4, -v4, v10, v8
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v3
+; VI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB9_14
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 11, v6
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT:  .LBB9_13: ; %frem.loop_body27
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v6, v5
+; VI-NEXT:    v_mul_f32_e32 v5, v6, v4
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT:    v_add_f32_e32 v7, v5, v2
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT:    v_ldexp_f32 v5, v5, 11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v3
+; VI-NEXT:    s_cbranch_vccnz .LBB9_13
+; VI-NEXT:    s_branch .LBB9_15
+; VI-NEXT:  .LBB9_14:
+; VI-NEXT:    v_mov_b32_e32 v6, v5
+; VI-NEXT:  .LBB9_15: ; %frem.loop_exit28
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -10, v3
+; VI-NEXT:    v_ldexp_f32 v3, v6, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT:    v_ldexp_f32 v1, v2, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s4
-; VI-NEXT:    v_trunc_f16_e32 v1, v1
-; VI-NEXT:    v_fma_f16 v1, -v1, v2, s4
+; VI-NEXT:    s_and_b32 s1, s4, 0xffff8000
+; VI-NEXT:    v_xor_b32_e32 v1, s1, v1
+; VI-NEXT:  .LBB9_16: ; %Flow57
+; VI-NEXT:    v_mov_b32_e32 v2, 0x60
+; VI-NEXT:    v_cmp_class_f16_e32 vcc, s2, v2
+; VI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s2, 3
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s0, v3
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s6, v2
+; VI-NEXT:    v_cmp_class_f16_e64 s[6:7], s6, 3
+; VI-NEXT:    v_cmp_class_f16_e64 s[4:5], s4, v3
+; VI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; VI-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[2:3]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[0:1]
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
@@ -608,174 +2232,757 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s6, 1
+; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
-; CI-NEXT:    s_lshr_b32 s8, s2, 16
-; CI-NEXT:    s_lshr_b32 s9, s3, 16
-; CI-NEXT:    s_lshr_b32 s10, s4, 16
-; CI-NEXT:    v_div_scale_f32 v2, s[6:7], v1, v1, v0
-; CI-NEXT:    s_lshr_b32 s11, s5, 16
-; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
-; CI-NEXT:    v_rcp_f32_e32 v4, v2
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s4|
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT:    s_cbranch_vccz .LBB10_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s6, s2, 0xffff8000
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT:    v_mov_b32_e32 v0, s6
+; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:  .LBB10_2: ; %Flow144
+; CI-NEXT:    s_xor_b32 s6, s6, 1
+; CI-NEXT:    s_and_b32 s6, s6, 1
+; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB10_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT:    v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT:    v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v8, v3
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
-; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; CI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; CI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; CI-NEXT:    v_fma_f32 v3, -v3, v9, v7
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, s8
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, s10
-; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; CI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB10_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT:  .LBB10_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT:    s_cbranch_vccnz .LBB10_5
+; CI-NEXT:    s_branch .LBB10_7
+; CI-NEXT:  .LBB10_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB10_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; CI-NEXT:    v_rndne_f32_e32 v3, v3
+; CI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_div_scale_f32 v3, s[6:7], v2, v2, v1
-; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v2, v1
-; CI-NEXT:    v_rcp_f32_e32 v5, v3
+; CI-NEXT:    s_and_b32 s6, s2, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; CI-NEXT:  .LBB10_8: ; %Flow145
+; CI-NEXT:    s_lshr_b32 s6, s2, 16
+; CI-NEXT:    s_lshr_b32 s7, s4, 16
+; CI-NEXT:    v_cvt_f32_f16_e64 v3, |s6|
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s7|
+; CI-NEXT:    s_mov_b32 s8, 1
+; CI-NEXT:    ; implicit-def: $vgpr1
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v3, v2
+; CI-NEXT:    s_cbranch_vccz .LBB10_10
+; CI-NEXT:  ; %bb.9: ; %frem.else20
+; CI-NEXT:    s_and_b32 s8, s6, 0xffff8000
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
+; CI-NEXT:    v_mov_b32_e32 v1, s8
+; CI-NEXT:    v_mov_b32_e32 v4, s6
+; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT:    s_mov_b32 s8, 0
+; CI-NEXT:  .LBB10_10: ; %Flow140
+; CI-NEXT:    s_xor_b32 s8, s8, 1
+; CI-NEXT:    s_and_b32 s8, s8, 1
+; CI-NEXT:    s_cmp_lg_u32 s8, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB10_16
+; CI-NEXT:  ; %bb.11: ; %frem.compute19
+; CI-NEXT:    v_frexp_mant_f32_e32 v4, v2
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v2
+; CI-NEXT:    v_ldexp_f32_e64 v2, v4, 1
+; CI-NEXT:    v_div_scale_f32 v4, s[8:9], v2, v2, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e32 v1, v3
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v3
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT:    v_ldexp_f32_e64 v5, v1, 11
+; CI-NEXT:    v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v9, v4
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
-; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
-; CI-NEXT:    v_mul_f32_e32 v6, v4, v5
-; CI-NEXT:    v_fma_f32 v7, -v3, v6, v4
-; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
-; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
+; CI-NEXT:    v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; CI-NEXT:    v_mul_f32_e32 v10, v8, v9
+; CI-NEXT:    v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; CI-NEXT:    v_fma_f32 v4, -v4, v10, v8
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
-; CI-NEXT:    v_trunc_f32_e32 v3, v3
-; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, s5
+; CI-NEXT:    v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v3
+; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB10_14
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 11, v6
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT:  .LBB10_13: ; %frem.loop_body27
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v6, v5
+; CI-NEXT:    v_mul_f32_e32 v5, v6, v4
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT:    v_add_f32_e32 v7, v5, v2
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT:    v_ldexp_f32_e64 v5, v5, 11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v3
+; CI-NEXT:    s_cbranch_vccnz .LBB10_13
+; CI-NEXT:    s_branch .LBB10_15
+; CI-NEXT:  .LBB10_14:
+; CI-NEXT:    v_mov_b32_e32 v6, v5
+; CI-NEXT:  .LBB10_15: ; %frem.loop_exit28
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -10, v3
+; CI-NEXT:    v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT:    v_add_f32_e32 v2, v3, v2
+; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, v2
-; CI-NEXT:    v_div_scale_f32 v5, vcc, v2, v3, v2
-; CI-NEXT:    v_rcp_f32_e32 v6, v4
+; CI-NEXT:    s_and_b32 s8, s6, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v1, s8, v1
+; CI-NEXT:  .LBB10_16: ; %Flow141
+; CI-NEXT:    v_cvt_f32_f16_e64 v4, |s3|
+; CI-NEXT:    v_cvt_f32_f16_e64 v3, |s5|
+; CI-NEXT:    s_mov_b32 s8, 1
+; CI-NEXT:    ; implicit-def: $vgpr2
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
+; CI-NEXT:    s_cbranch_vccz .LBB10_18
+; CI-NEXT:  ; %bb.17: ; %frem.else56
+; CI-NEXT:    s_and_b32 s8, s3, 0xffff8000
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
+; CI-NEXT:    v_mov_b32_e32 v2, s8
+; CI-NEXT:    v_mov_b32_e32 v5, s3
+; CI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; CI-NEXT:    s_mov_b32 s8, 0
+; CI-NEXT:  .LBB10_18: ; %Flow136
+; CI-NEXT:    s_xor_b32 s8, s8, 1
+; CI-NEXT:    s_and_b32 s8, s8, 1
+; CI-NEXT:    s_cmp_lg_u32 s8, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB10_24
+; CI-NEXT:  ; %bb.19: ; %frem.compute55
+; CI-NEXT:    v_frexp_mant_f32_e32 v5, v3
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
+; CI-NEXT:    v_ldexp_f32_e64 v3, v5, 1
+; CI-NEXT:    v_div_scale_f32 v5, s[8:9], v3, v3, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -1, v7
+; CI-NEXT:    v_ldexp_f32_e64 v6, v2, 11
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
+; CI-NEXT:    v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v10, v5
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v7, -v4, v6, 1.0
-; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
-; CI-NEXT:    v_mul_f32_e32 v7, v5, v6
-; CI-NEXT:    v_fma_f32 v8, -v4, v7, v5
-; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
-; CI-NEXT:    v_fma_f32 v4, -v4, v7, v5
+; CI-NEXT:    v_fma_f32 v11, -v5, v10, 1.0
+; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
+; CI-NEXT:    v_mul_f32_e32 v11, v9, v10
+; CI-NEXT:    v_fma_f32 v12, -v5, v11, v9
+; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
+; CI-NEXT:    v_fma_f32 v5, -v5, v11, v9
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
-; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v2
-; CI-NEXT:    v_trunc_f32_e32 v4, v4
-; CI-NEXT:    v_fma_f32 v2, -v4, v3, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, s9
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, s11
+; CI-NEXT:    v_div_fmas_f32 v5, v5, v10, v11
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v4
+; CI-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB10_22
+; CI-NEXT:  ; %bb.20: ; %frem.loop_body63.preheader
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 11, v7
+; CI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
+; CI-NEXT:  .LBB10_21: ; %frem.loop_body63
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v6
+; CI-NEXT:    v_mul_f32_e32 v6, v7, v5
+; CI-NEXT:    v_rndne_f32_e32 v6, v6
+; CI-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT:    v_add_f32_e32 v8, v6, v3
+; CI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -11, v4
+; CI-NEXT:    v_ldexp_f32_e64 v6, v6, 11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v4
+; CI-NEXT:    s_cbranch_vccnz .LBB10_21
+; CI-NEXT:    s_branch .LBB10_23
+; CI-NEXT:  .LBB10_22:
+; CI-NEXT:    v_mov_b32_e32 v7, v6
+; CI-NEXT:  .LBB10_23: ; %frem.loop_exit64
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -10, v4
+; CI-NEXT:    v_ldexp_f32_e32 v4, v7, v4
+; CI-NEXT:    v_mul_f32_e32 v5, v4, v5
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v3, v4, v3
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v2, v3, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_div_scale_f32 v5, s[2:3], v4, v4, v3
-; CI-NEXT:    v_div_scale_f32 v6, vcc, v3, v4, v3
-; CI-NEXT:    v_rcp_f32_e32 v7, v5
+; CI-NEXT:    s_and_b32 s8, s3, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v2, s8, v2
+; CI-NEXT:  .LBB10_24: ; %Flow137
+; CI-NEXT:    s_lshr_b32 s8, s3, 16
+; CI-NEXT:    s_lshr_b32 s9, s5, 16
+; CI-NEXT:    v_cvt_f32_f16_e64 v5, |s8|
+; CI-NEXT:    v_cvt_f32_f16_e64 v4, |s9|
+; CI-NEXT:    s_mov_b32 s10, 1
+; CI-NEXT:    ; implicit-def: $vgpr3
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v5, v4
+; CI-NEXT:    s_cbranch_vccz .LBB10_26
+; CI-NEXT:  ; %bb.25: ; %frem.else92
+; CI-NEXT:    s_and_b32 s10, s8, 0xffff8000
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v5, v4
+; CI-NEXT:    v_mov_b32_e32 v3, s10
+; CI-NEXT:    v_mov_b32_e32 v6, s8
+; CI-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; CI-NEXT:    s_mov_b32 s10, 0
+; CI-NEXT:  .LBB10_26: ; %Flow132
+; CI-NEXT:    s_xor_b32 s10, s10, 1
+; CI-NEXT:    s_and_b32 s10, s10, 1
+; CI-NEXT:    s_cmp_lg_u32 s10, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB10_32
+; CI-NEXT:  ; %bb.27: ; %frem.compute91
+; CI-NEXT:    v_frexp_mant_f32_e32 v6, v4
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v9, v4
+; CI-NEXT:    v_ldexp_f32_e64 v4, v6, 1
+; CI-NEXT:    v_div_scale_f32 v6, s[10:11], v4, v4, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e32 v3, v5
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v5
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -1, v8
+; CI-NEXT:    v_ldexp_f32_e64 v7, v3, 11
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v9
+; CI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
+; CI-NEXT:    v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v11, v6
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
-; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
-; CI-NEXT:    v_mul_f32_e32 v8, v6, v7
-; CI-NEXT:    v_fma_f32 v9, -v5, v8, v6
-; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
-; CI-NEXT:    v_fma_f32 v5, -v5, v8, v6
+; CI-NEXT:    v_fma_f32 v12, -v6, v11, 1.0
+; CI-NEXT:    v_fma_f32 v11, v12, v11, v11
+; CI-NEXT:    v_mul_f32_e32 v12, v10, v11
+; CI-NEXT:    v_fma_f32 v13, -v6, v12, v10
+; CI-NEXT:    v_fma_f32 v12, v13, v11, v12
+; CI-NEXT:    v_fma_f32 v6, -v6, v12, v10
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
+; CI-NEXT:    v_div_fmas_f32 v6, v6, v11, v12
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v5
+; CI-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB10_30
+; CI-NEXT:  ; %bb.28: ; %frem.loop_body99.preheader
+; CI-NEXT:    v_add_i32_e32 v5, vcc, 11, v8
+; CI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; CI-NEXT:  .LBB10_29: ; %frem.loop_body99
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v8, v7
+; CI-NEXT:    v_mul_f32_e32 v7, v8, v6
+; CI-NEXT:    v_rndne_f32_e32 v7, v7
+; CI-NEXT:    v_fma_f32 v7, -v7, v4, v8
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT:    v_add_f32_e32 v9, v7, v4
+; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -11, v5
+; CI-NEXT:    v_ldexp_f32_e64 v7, v7, 11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v5
+; CI-NEXT:    s_cbranch_vccnz .LBB10_29
+; CI-NEXT:    s_branch .LBB10_31
+; CI-NEXT:  .LBB10_30:
+; CI-NEXT:    v_mov_b32_e32 v8, v7
+; CI-NEXT:  .LBB10_31: ; %frem.loop_exit100
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -10, v5
+; CI-NEXT:    v_ldexp_f32_e32 v5, v8, v5
+; CI-NEXT:    v_mul_f32_e32 v6, v5, v6
+; CI-NEXT:    v_rndne_f32_e32 v6, v6
+; CI-NEXT:    v_fma_f32 v5, -v6, v4, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT:    v_add_f32_e32 v4, v5, v4
+; CI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v3, v4, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    s_and_b32 s10, s8, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v3, s10, v3
+; CI-NEXT:  .LBB10_32: ; %Flow133
+; CI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; CI-NEXT:    s_and_b32 s4, 0xffff, s4
+; CI-NEXT:    s_cmp_eq_u32 s4, 0
+; CI-NEXT:    s_cselect_b32 s10, 1, 0
+; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
+; CI-NEXT:    s_and_b32 s2, 0xffff, s2
+; CI-NEXT:    s_cmpk_lt_u32 s2, 0x7c00
+; CI-NEXT:    s_cselect_b32 s2, 1, 0
+; CI-NEXT:    s_cmpk_le_u32 s4, 0x7c00
+; CI-NEXT:    s_cselect_b32 s4, 1, 0
+; CI-NEXT:    s_and_b32 s2, s4, s2
+; CI-NEXT:    s_and_b32 s4, s7, 0x7fff
+; CI-NEXT:    s_and_b32 s4, 0xffff, s4
+; CI-NEXT:    s_cmp_eq_u32 s4, 0
+; CI-NEXT:    s_cselect_b32 s7, 1, 0
+; CI-NEXT:    s_and_b32 s6, s6, 0x7fff
+; CI-NEXT:    s_and_b32 s6, 0xffff, s6
+; CI-NEXT:    s_cmpk_lt_u32 s6, 0x7c00
+; CI-NEXT:    s_cselect_b32 s6, 1, 0
+; CI-NEXT:    s_cmpk_le_u32 s4, 0x7c00
+; CI-NEXT:    s_cselect_b32 s4, 1, 0
+; CI-NEXT:    s_and_b32 s5, s5, 0x7fff
+; CI-NEXT:    s_and_b32 s4, s4, s6
+; CI-NEXT:    s_and_b32 s5, 0xffff, s5
+; CI-NEXT:    s_cmp_eq_u32 s5, 0
+; CI-NEXT:    s_cselect_b32 s6, 1, 0
+; CI-NEXT:    s_and_b32 s3, s3, 0x7fff
+; CI-NEXT:    s_and_b32 s3, 0xffff, s3
+; CI-NEXT:    s_cmpk_lt_u32 s3, 0x7c00
+; CI-NEXT:    s_cselect_b32 s3, 1, 0
+; CI-NEXT:    s_cmpk_le_u32 s5, 0x7c00
+; CI-NEXT:    s_cselect_b32 s5, 1, 0
+; CI-NEXT:    s_and_b32 s3, s5, s3
+; CI-NEXT:    s_and_b32 s5, s9, 0x7fff
+; CI-NEXT:    s_and_b32 s5, 0xffff, s5
+; CI-NEXT:    s_cmp_eq_u32 s5, 0
+; CI-NEXT:    s_cselect_b32 s9, 1, 0
+; CI-NEXT:    s_and_b32 s8, s8, 0x7fff
+; CI-NEXT:    s_and_b32 s8, 0xffff, s8
+; CI-NEXT:    s_cmpk_lt_u32 s8, 0x7c00
+; CI-NEXT:    s_cselect_b32 s8, 1, 0
+; CI-NEXT:    s_cmpk_le_u32 s5, 0x7c00
+; CI-NEXT:    s_cselect_b32 s5, 1, 0
+; CI-NEXT:    s_and_b32 s5, s5, s8
+; CI-NEXT:    s_and_b32 s8, 1, s10
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s8
+; CI-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; CI-NEXT:    s_and_b32 s2, 1, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    s_and_b32 s2, 1, s7
+; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    s_and_b32 s2, 1, s4
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    s_and_b32 s2, 1, s6
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    s_and_b32 s2, 1, s3
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    s_and_b32 s2, 1, s9
+; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    s_and_b32 s2, 1, s5
+; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v3
-; CI-NEXT:    v_trunc_f32_e32 v5, v5
-; CI-NEXT:    v_fma_f32 v3, -v5, v4, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; CI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v4f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[18:19], 0x0
+; VI-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x20
+; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
-; VI-NEXT:    s_lshr_b32 s8, s4, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s4
-; VI-NEXT:    s_lshr_b32 s6, s2, 16
-; VI-NEXT:    v_rcp_f32_e32 v3, v2
-; VI-NEXT:    s_lshr_b32 s9, s5, 16
-; VI-NEXT:    s_lshr_b32 s7, s3, 16
-; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
-; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
-; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
-; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
-; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
-; VI-NEXT:    v_add_f32_e32 v0, v0, v4
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s8|
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, |s10|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
+; VI-NEXT:    s_cbranch_vccz .LBB10_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s0, s8, 0xffff8000
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s8
+; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:  .LBB10_2: ; %Flow144
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB10_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
+; VI-NEXT:    v_ldexp_f32 v1, v3, 1
+; VI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e32 v0, v2
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT:    v_ldexp_f32 v4, v0, 11
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v8, v3
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; VI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; VI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; VI-NEXT:    v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB10_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT:  .LBB10_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT:    v_ldexp_f32 v4, v4, 11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT:    s_cbranch_vccnz .LBB10_5
+; VI-NEXT:    s_branch .LBB10_7
+; VI-NEXT:  .LBB10_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB10_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; VI-NEXT:    v_rndne_f32_e32 v3, v3
+; VI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, s8
-; VI-NEXT:    v_mov_b32_e32 v2, s8
-; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
-; VI-NEXT:    v_trunc_f16_e32 v0, v0
-; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
-; VI-NEXT:    v_cvt_f32_f16_e32 v1, s6
-; VI-NEXT:    v_rcp_f32_e32 v4, v3
-; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
-; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
-; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
-; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
-; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
-; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; VI-NEXT:    v_add_f32_e32 v1, v1, v5
+; VI-NEXT:    s_and_b32 s0, s8, 0xffff8000
+; VI-NEXT:    v_xor_b32_e32 v0, s0, v0
+; VI-NEXT:  .LBB10_8: ; %Flow145
+; VI-NEXT:    s_lshr_b32 s4, s8, 16
+; VI-NEXT:    s_lshr_b32 s6, s10, 16
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |s4|
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s6|
+; VI-NEXT:    s_mov_b32 s0, 1
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v3, v2
+; VI-NEXT:    s_cbranch_vccz .LBB10_10
+; VI-NEXT:  ; %bb.9: ; %frem.else20
+; VI-NEXT:    s_and_b32 s0, s4, 0xffff8000
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:  .LBB10_10: ; %Flow140
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB10_16
+; VI-NEXT:  ; %bb.11: ; %frem.compute19
+; VI-NEXT:    v_frexp_mant_f32_e32 v4, v2
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v2
+; VI-NEXT:    v_ldexp_f32 v2, v4, 1
+; VI-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e32 v1, v3
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v3
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT:    v_ldexp_f32 v5, v1, 11
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v9, v4
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; VI-NEXT:    v_mul_f32_e32 v10, v8, v9
+; VI-NEXT:    v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; VI-NEXT:    v_fma_f32 v4, -v4, v10, v8
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v3
+; VI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB10_14
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 11, v6
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT:  .LBB10_13: ; %frem.loop_body27
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v6, v5
+; VI-NEXT:    v_mul_f32_e32 v5, v6, v4
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT:    v_add_f32_e32 v7, v5, v2
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT:    v_ldexp_f32 v5, v5, 11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v3
+; VI-NEXT:    s_cbranch_vccnz .LBB10_13
+; VI-NEXT:    s_branch .LBB10_15
+; VI-NEXT:  .LBB10_14:
+; VI-NEXT:    v_mov_b32_e32 v6, v5
+; VI-NEXT:  .LBB10_15: ; %frem.loop_exit28
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -10, v3
+; VI-NEXT:    v_ldexp_f32 v3, v6, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT:    v_ldexp_f32 v1, v2, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT:    v_cvt_f32_f16_e32 v4, s5
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s6
-; VI-NEXT:    v_trunc_f16_e32 v1, v1
-; VI-NEXT:    v_fma_f16 v1, -v1, v2, s6
-; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT:    v_rcp_f32_e32 v5, v4
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_mul_f32_e32 v6, v2, v5
-; VI-NEXT:    v_mad_f32 v7, -v4, v6, v2
-; VI-NEXT:    v_mac_f32_e32 v6, v7, v5
-; VI-NEXT:    v_mad_f32 v2, -v4, v6, v2
-; VI-NEXT:    v_mul_f32_e32 v2, v2, v5
-; VI-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
-; VI-NEXT:    v_add_f32_e32 v2, v2, v6
+; VI-NEXT:    s_and_b32 s0, s4, 0xffff8000
+; VI-NEXT:    v_xor_b32_e32 v1, s0, v1
+; VI-NEXT:  .LBB10_16: ; %Flow141
+; VI-NEXT:    v_cvt_f32_f16_e64 v4, |s9|
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |s11|
+; VI-NEXT:    s_mov_b32 s0, 1
+; VI-NEXT:    ; implicit-def: $vgpr2
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
+; VI-NEXT:    s_cbranch_vccz .LBB10_18
+; VI-NEXT:  ; %bb.17: ; %frem.else56
+; VI-NEXT:    s_and_b32 s0, s9, 0xffff8000
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s9
+; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:  .LBB10_18: ; %Flow136
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB10_24
+; VI-NEXT:  ; %bb.19: ; %frem.compute55
+; VI-NEXT:    v_frexp_mant_f32_e32 v5, v3
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
+; VI-NEXT:    v_ldexp_f32 v3, v5, 1
+; VI-NEXT:    v_div_scale_f32 v5, s[0:1], v3, v3, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -1, v7
+; VI-NEXT:    v_ldexp_f32 v6, v2, 11
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v2
+; VI-NEXT:    v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v10, v5
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v11, -v5, v10, 1.0
+; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
+; VI-NEXT:    v_mul_f32_e32 v11, v9, v10
+; VI-NEXT:    v_fma_f32 v12, -v5, v11, v9
+; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
+; VI-NEXT:    v_fma_f32 v5, -v5, v11, v9
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v5, v5, v10, v11
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v4
+; VI-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB10_22
+; VI-NEXT:  ; %bb.20: ; %frem.loop_body63.preheader
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 11, v7
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v8
+; VI-NEXT:  .LBB10_21: ; %frem.loop_body63
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v6
+; VI-NEXT:    v_mul_f32_e32 v6, v7, v5
+; VI-NEXT:    v_rndne_f32_e32 v6, v6
+; VI-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT:    v_add_f32_e32 v8, v6, v3
+; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -11, v4
+; VI-NEXT:    v_ldexp_f32 v6, v6, 11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v4
+; VI-NEXT:    s_cbranch_vccnz .LBB10_21
+; VI-NEXT:    s_branch .LBB10_23
+; VI-NEXT:  .LBB10_22:
+; VI-NEXT:    v_mov_b32_e32 v7, v6
+; VI-NEXT:  .LBB10_23: ; %frem.loop_exit64
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -10, v4
+; VI-NEXT:    v_ldexp_f32 v4, v7, v4
+; VI-NEXT:    v_mul_f32_e32 v5, v4, v5
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT:    v_ldexp_f32 v2, v3, v2
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT:    v_mov_b32_e32 v4, s9
-; VI-NEXT:    v_div_fixup_f16 v2, v2, v3, s3
-; VI-NEXT:    v_trunc_f16_e32 v2, v2
-; VI-NEXT:    v_fma_f16 v2, -v2, v3, s3
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, s7
-; VI-NEXT:    v_rcp_f32_e32 v6, v5
-; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
-; VI-NEXT:    v_mad_f32 v8, -v5, v7, v3
-; VI-NEXT:    v_mac_f32_e32 v7, v8, v6
-; VI-NEXT:    v_mad_f32 v3, -v5, v7, v3
-; VI-NEXT:    v_mul_f32_e32 v3, v3, v6
-; VI-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
-; VI-NEXT:    v_add_f32_e32 v3, v3, v7
+; VI-NEXT:    s_and_b32 s0, s9, 0xffff8000
+; VI-NEXT:    v_xor_b32_e32 v2, s0, v2
+; VI-NEXT:  .LBB10_24: ; %Flow137
+; VI-NEXT:    s_lshr_b32 s12, s9, 16
+; VI-NEXT:    s_lshr_b32 s14, s11, 16
+; VI-NEXT:    v_cvt_f32_f16_e64 v5, |s12|
+; VI-NEXT:    v_cvt_f32_f16_e64 v4, |s14|
+; VI-NEXT:    s_mov_b32 s0, 1
+; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v5, v4
+; VI-NEXT:    s_cbranch_vccz .LBB10_26
+; VI-NEXT:  ; %bb.25: ; %frem.else92
+; VI-NEXT:    s_and_b32 s0, s12, 0xffff8000
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v5, v4
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v6, s12
+; VI-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:  .LBB10_26: ; %Flow132
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB10_32
+; VI-NEXT:  ; %bb.27: ; %frem.compute91
+; VI-NEXT:    v_frexp_mant_f32_e32 v6, v4
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v9, v4
+; VI-NEXT:    v_ldexp_f32 v4, v6, 1
+; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e32 v3, v5
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v5
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -1, v8
+; VI-NEXT:    v_ldexp_f32 v7, v3, 11
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v9
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, v5, v3
+; VI-NEXT:    v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v11, v6
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v12, -v6, v11, 1.0
+; VI-NEXT:    v_fma_f32 v11, v12, v11, v11
+; VI-NEXT:    v_mul_f32_e32 v12, v10, v11
+; VI-NEXT:    v_fma_f32 v13, -v6, v12, v10
+; VI-NEXT:    v_fma_f32 v12, v13, v11, v12
+; VI-NEXT:    v_fma_f32 v6, -v6, v12, v10
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v6, v6, v11, v12
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 11, v5
+; VI-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB10_30
+; VI-NEXT:  ; %bb.28: ; %frem.loop_body99.preheader
+; VI-NEXT:    v_add_u32_e32 v5, vcc, 11, v8
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, v5, v9
+; VI-NEXT:  .LBB10_29: ; %frem.loop_body99
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v8, v7
+; VI-NEXT:    v_mul_f32_e32 v7, v8, v6
+; VI-NEXT:    v_rndne_f32_e32 v7, v7
+; VI-NEXT:    v_fma_f32 v7, -v7, v4, v8
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT:    v_add_f32_e32 v9, v7, v4
+; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -11, v5
+; VI-NEXT:    v_ldexp_f32 v7, v7, 11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v5
+; VI-NEXT:    s_cbranch_vccnz .LBB10_29
+; VI-NEXT:    s_branch .LBB10_31
+; VI-NEXT:  .LBB10_30:
+; VI-NEXT:    v_mov_b32_e32 v8, v7
+; VI-NEXT:  .LBB10_31: ; %frem.loop_exit100
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -10, v5
+; VI-NEXT:    v_ldexp_f32 v5, v8, v5
+; VI-NEXT:    v_mul_f32_e32 v6, v5, v6
+; VI-NEXT:    v_rndne_f32_e32 v6, v6
+; VI-NEXT:    v_fma_f32 v5, -v6, v4, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT:    v_add_f32_e32 v4, v5, v4
+; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT:    v_ldexp_f32 v3, v4, v3
 ; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; VI-NEXT:    v_div_fixup_f16 v3, v3, v4, s7
-; VI-NEXT:    v_trunc_f16_e32 v3, v3
-; VI-NEXT:    v_fma_f16 v3, -v3, v4, s7
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_and_b32 s0, s12, 0xffff8000
+; VI-NEXT:    v_xor_b32_e32 v3, s0, v3
+; VI-NEXT:  .LBB10_32: ; %Flow133
+; VI-NEXT:    v_mov_b32_e32 v5, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s10, 3
+; VI-NEXT:    v_mov_b32_e32 v4, 0x60
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s8, v5
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s6, v4
+; VI-NEXT:    v_cmp_class_f16_e64 s[6:7], s6, 3
+; VI-NEXT:    v_cmp_class_f16_e64 s[4:5], s4, v5
+; VI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
+; VI-NEXT:    v_cmp_class_f16_e32 vcc, s10, v4
+; VI-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; VI-NEXT:    v_cmp_class_f16_e64 s[6:7], s11, v4
+; VI-NEXT:    v_cmp_class_f16_e64 s[10:11], s11, 3
+; VI-NEXT:    v_cmp_class_f16_e64 s[8:9], s9, v5
+; VI-NEXT:    s_xor_b64 s[10:11], s[10:11], -1
+; VI-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
+; VI-NEXT:    v_cmp_class_f16_e64 s[10:11], s14, v4
+; VI-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; VI-NEXT:    v_cmp_class_f16_e64 s[14:15], s14, 3
+; VI-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_cmp_class_f16_e64 s[12:13], s12, v5
+; VI-NEXT:    s_xor_b64 s[14:15], s[14:15], -1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; VI-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
+; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[10:11]
+; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[6:7]
+; VI-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[12:13]
+; VI-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[8:9]
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_mov_b32_e32 v2, s16
+; VI-NEXT:    v_mov_b32_e32 v3, s17
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
@@ -791,43 +2998,178 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s6, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_div_scale_f32 v1, s[6:7], v0, v0, s2
-; CI-NEXT:    v_div_scale_f32 v2, vcc, s2, v0, s2
-; CI-NEXT:    v_rcp_f32_e32 v3, v1
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT:    ; implicit-def: $vgpr0
+; CI-NEXT:    s_cbranch_vccz .LBB11_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s6, s2, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v1, s4
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s6
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:  .LBB11_2: ; %Flow56
+; CI-NEXT:    s_xor_b32 s6, s6, 1
+; CI-NEXT:    s_and_b32 s6, s6, 1
+; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB11_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s4|
+; CI-NEXT:    v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT:    v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s4|
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v8, v3
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; CI-NEXT:    v_fma_f32 v3, v4, v3, v3
-; CI-NEXT:    v_mul_f32_e32 v4, v2, v3
-; CI-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; CI-NEXT:    v_fma_f32 v4, v5, v3, v4
-; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; CI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; CI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; CI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; CI-NEXT:    v_fma_f32 v3, -v3, v9, v7
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
-; CI-NEXT:    v_trunc_f32_e32 v1, v1
-; CI-NEXT:    v_fma_f32 v0, -v1, v0, s2
+; CI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB11_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT:  .LBB11_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT:    s_cbranch_vccnz .LBB11_5
+; CI-NEXT:    s_branch .LBB11_7
+; CI-NEXT:  .LBB11_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB11_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; CI-NEXT:    v_rndne_f32_e32 v3, v3
+; CI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT:    s_and_b32 s6, s2, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; CI-NEXT:  .LBB11_8: ; %Flow57
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, s3
-; CI-NEXT:    v_div_scale_f32 v3, vcc, s3, v1, s3
-; CI-NEXT:    v_rcp_f32_e32 v4, v2
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s3|, |v1|
+; CI-NEXT:    s_mov_b32 s6, 1
+; CI-NEXT:    ; implicit-def: $vgpr1
+; CI-NEXT:    s_cbranch_vccz .LBB11_10
+; CI-NEXT:  ; %bb.9: ; %frem.else16
+; CI-NEXT:    s_and_b32 s6, s3, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v2, s5
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v2|
+; CI-NEXT:    v_mov_b32_e32 v2, s6
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:  .LBB11_10: ; %Flow52
+; CI-NEXT:    s_xor_b32 s6, s6, 1
+; CI-NEXT:    s_and_b32 s6, s6, 1
+; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB11_16
+; CI-NEXT:  ; %bb.11: ; %frem.compute15
+; CI-NEXT:    v_frexp_mant_f32_e64 v2, |s5|
+; CI-NEXT:    v_ldexp_f32_e64 v2, v2, 1
+; CI-NEXT:    v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s5|
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT:    v_ldexp_f32_e64 v5, v1, 12
+; CI-NEXT:    v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v9, v4
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
-; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT:    v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; CI-NEXT:    v_mul_f32_e32 v10, v8, v9
+; CI-NEXT:    v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; CI-NEXT:    v_fma_f32 v4, -v4, v10, v8
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; CI-NEXT:    v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v3
+; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB11_14
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 12, v6
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT:  .LBB11_13: ; %frem.loop_body23
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v6, v5
+; CI-NEXT:    v_mul_f32_e32 v5, v6, v4
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT:    v_add_f32_e32 v7, v5, v2
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -12, v3
+; CI-NEXT:    v_ldexp_f32_e64 v5, v5, 12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v3
+; CI-NEXT:    s_cbranch_vccnz .LBB11_13
+; CI-NEXT:    s_branch .LBB11_15
+; CI-NEXT:  .LBB11_14:
+; CI-NEXT:    v_mov_b32_e32 v6, v5
+; CI-NEXT:  .LBB11_15: ; %frem.loop_exit24
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT:    v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT:    v_add_f32_e32 v2, v3, v2
+; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; CI-NEXT:    s_and_b32 s6, s3, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v1, s6, v1
+; CI-NEXT:  .LBB11_16: ; %Flow53
+; CI-NEXT:    v_mov_b32_e32 v2, 0x60
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v2
+; CI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v4, 0x1f8
+; CI-NEXT:    v_cmp_class_f32_e64 s[6:7], s4, 3
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v4
+; CI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
+; CI-NEXT:    s_and_b64 vcc, s[6:7], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v2
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v4
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s5, 3
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, s3
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v1, -v2, v1, s3
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -836,42 +3178,177 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_mov_b32 s6, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_div_scale_f32 v1, s[6:7], v0, v0, s2
-; VI-NEXT:    v_div_scale_f32 v2, vcc, s2, v0, s2
-; VI-NEXT:    v_rcp_f32_e32 v3, v1
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT:    ; implicit-def: $vgpr0
+; VI-NEXT:    s_cbranch_vccz .LBB11_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s6, s2, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:  .LBB11_2: ; %Flow56
+; VI-NEXT:    s_xor_b32 s6, s6, 1
+; VI-NEXT:    s_and_b32 s6, s6, 1
+; VI-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB11_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s4|
+; VI-NEXT:    v_ldexp_f32 v1, v1, 1
+; VI-NEXT:    v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s4|
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT:    v_ldexp_f32 v4, v0, 12
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v8, v3
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
-; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
-; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
-; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; VI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; VI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; VI-NEXT:    v_fma_f32 v3, -v3, v9, v7
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
-; VI-NEXT:    v_trunc_f32_e32 v1, v1
-; VI-NEXT:    v_fma_f32 v0, -v1, v0, s2
+; VI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB11_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT:  .LBB11_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT:    v_ldexp_f32 v4, v4, 12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT:    s_cbranch_vccnz .LBB11_5
+; VI-NEXT:    s_branch .LBB11_7
+; VI-NEXT:  .LBB11_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB11_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; VI-NEXT:    v_rndne_f32_e32 v3, v3
+; VI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_and_b32 s6, s2, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; VI-NEXT:  .LBB11_8: ; %Flow57
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, s3
-; VI-NEXT:    v_div_scale_f32 v3, vcc, s3, v1, s3
-; VI-NEXT:    v_rcp_f32_e32 v4, v2
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s3|, |v1|
+; VI-NEXT:    s_mov_b32 s6, 1
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    s_cbranch_vccz .LBB11_10
+; VI-NEXT:  ; %bb.9: ; %frem.else16
+; VI-NEXT:    s_and_b32 s6, s3, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v2|
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:  .LBB11_10: ; %Flow52
+; VI-NEXT:    s_xor_b32 s6, s6, 1
+; VI-NEXT:    s_and_b32 s6, s6, 1
+; VI-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB11_16
+; VI-NEXT:  ; %bb.11: ; %frem.compute15
+; VI-NEXT:    v_frexp_mant_f32_e64 v2, |s5|
+; VI-NEXT:    v_ldexp_f32 v2, v2, 1
+; VI-NEXT:    v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s5|
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT:    v_ldexp_f32 v5, v1, 12
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v9, v4
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; VI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; VI-NEXT:    v_mul_f32_e32 v5, v3, v4
-; VI-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; VI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; VI-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; VI-NEXT:    v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; VI-NEXT:    v_mul_f32_e32 v10, v8, v9
+; VI-NEXT:    v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; VI-NEXT:    v_fma_f32 v4, -v4, v10, v8
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; VI-NEXT:    v_div_fixup_f32 v2, v2, v1, s3
-; VI-NEXT:    v_trunc_f32_e32 v2, v2
-; VI-NEXT:    v_fma_f32 v1, -v2, v1, s3
+; VI-NEXT:    v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v3
+; VI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB11_14
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 12, v6
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT:  .LBB11_13: ; %frem.loop_body23
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v6, v5
+; VI-NEXT:    v_mul_f32_e32 v5, v6, v4
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT:    v_add_f32_e32 v7, v5, v2
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -12, v3
+; VI-NEXT:    v_ldexp_f32 v5, v5, 12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v3
+; VI-NEXT:    s_cbranch_vccnz .LBB11_13
+; VI-NEXT:    s_branch .LBB11_15
+; VI-NEXT:  .LBB11_14:
+; VI-NEXT:    v_mov_b32_e32 v6, v5
+; VI-NEXT:  .LBB11_15: ; %frem.loop_exit24
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT:    v_ldexp_f32 v3, v6, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT:    v_ldexp_f32 v1, v2, v1
+; VI-NEXT:    s_and_b32 s6, s3, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v1, s6, v1
+; VI-NEXT:  .LBB11_16: ; %Flow53
+; VI-NEXT:    v_mov_b32_e32 v2, 0x60
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v2
+; VI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1f8
+; VI-NEXT:    v_cmp_class_f32_e64 s[6:7], s4, 3
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v4
+; VI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
+; VI-NEXT:    s_and_b64 vcc, s[6:7], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v2
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v4
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s5, 3
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -892,73 +3369,340 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; CI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; CI-NEXT:    s_mov_b32 s2, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s8
-; CI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s4
-; CI-NEXT:    v_div_scale_f32 v2, vcc, s4, v0, s4
-; CI-NEXT:    v_rcp_f32_e32 v3, v1
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
+; CI-NEXT:    ; implicit-def: $vgpr0
+; CI-NEXT:    s_cbranch_vccz .LBB12_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_and_b32 s2, s4, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v1, s8
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:  .LBB12_2: ; %Flow136
+; CI-NEXT:    s_xor_b32 s2, s2, 1
+; CI-NEXT:    s_and_b32 s2, s2, 1
+; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB12_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s8|
+; CI-NEXT:    v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT:    v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s4|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s4|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s8|
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v8, v3
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; CI-NEXT:    v_fma_f32 v3, v4, v3, v3
-; CI-NEXT:    v_mul_f32_e32 v4, v2, v3
-; CI-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; CI-NEXT:    v_fma_f32 v4, v5, v3, v4
-; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; CI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; CI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; CI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; CI-NEXT:    v_fma_f32 v3, -v3, v9, v7
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s4
-; CI-NEXT:    v_trunc_f32_e32 v1, v1
-; CI-NEXT:    v_fma_f32 v0, -v1, v0, s4
+; CI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB12_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT:  .LBB12_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v6, v4, v1
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT:    s_cbranch_vccnz .LBB12_5
+; CI-NEXT:    s_branch .LBB12_7
+; CI-NEXT:  .LBB12_6:
+; CI-NEXT:    v_mov_b32_e32 v5, v4
+; CI-NEXT:  .LBB12_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT:    v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; CI-NEXT:    v_rndne_f32_e32 v3, v3
+; CI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_add_f32_e32 v1, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT:    s_and_b32 s2, s4, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; CI-NEXT:  .LBB12_8: ; %Flow137
 ; CI-NEXT:    v_mov_b32_e32 v1, s9
-; CI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, s5
-; CI-NEXT:    v_div_scale_f32 v3, vcc, s5, v1, s5
-; CI-NEXT:    v_rcp_f32_e32 v4, v2
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s5|, |v1|
+; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    ; implicit-def: $vgpr1
+; CI-NEXT:    s_cbranch_vccz .LBB12_10
+; CI-NEXT:  ; %bb.9: ; %frem.else16
+; CI-NEXT:    s_and_b32 s2, s5, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v2, s9
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v2|
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:  .LBB12_10: ; %Flow132
+; CI-NEXT:    s_xor_b32 s2, s2, 1
+; CI-NEXT:    s_and_b32 s2, s2, 1
+; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB12_16
+; CI-NEXT:  ; %bb.11: ; %frem.compute15
+; CI-NEXT:    v_frexp_mant_f32_e64 v2, |s9|
+; CI-NEXT:    v_ldexp_f32_e64 v2, v2, 1
+; CI-NEXT:    v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s5|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s5|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s9|
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT:    v_ldexp_f32_e64 v5, v1, 12
+; CI-NEXT:    v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v9, v4
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
-; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT:    v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; CI-NEXT:    v_mul_f32_e32 v10, v8, v9
+; CI-NEXT:    v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; CI-NEXT:    v_fma_f32 v4, -v4, v10, v8
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, s5
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v1, -v2, v1, s5
+; CI-NEXT:    v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v3
+; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB12_14
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 12, v6
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT:  .LBB12_13: ; %frem.loop_body23
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v6, v5
+; CI-NEXT:    v_mul_f32_e32 v5, v6, v4
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT:    v_add_f32_e32 v7, v5, v2
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -12, v3
+; CI-NEXT:    v_ldexp_f32_e64 v5, v5, 12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v3
+; CI-NEXT:    s_cbranch_vccnz .LBB12_13
+; CI-NEXT:    s_branch .LBB12_15
+; CI-NEXT:  .LBB12_14:
+; CI-NEXT:    v_mov_b32_e32 v6, v5
+; CI-NEXT:  .LBB12_15: ; %frem.loop_exit24
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT:    v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT:    v_add_f32_e32 v2, v3, v2
+; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; CI-NEXT:    s_and_b32 s2, s5, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v1, s2, v1
+; CI-NEXT:  .LBB12_16: ; %Flow133
 ; CI-NEXT:    v_mov_b32_e32 v2, s10
-; CI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, s6
-; CI-NEXT:    v_div_scale_f32 v4, vcc, s6, v2, s6
-; CI-NEXT:    v_rcp_f32_e32 v5, v3
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s6|, |v2|
+; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    ; implicit-def: $vgpr2
+; CI-NEXT:    s_cbranch_vccz .LBB12_18
+; CI-NEXT:  ; %bb.17: ; %frem.else50
+; CI-NEXT:    s_and_b32 s2, s6, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v3, s10
+; CI-NEXT:    v_mov_b32_e32 v2, s6
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v3|
+; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:  .LBB12_18: ; %Flow128
+; CI-NEXT:    s_xor_b32 s2, s2, 1
+; CI-NEXT:    s_and_b32 s2, s2, 1
+; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB12_24
+; CI-NEXT:  ; %bb.19: ; %frem.compute49
+; CI-NEXT:    v_frexp_mant_f32_e64 v3, |s10|
+; CI-NEXT:    v_ldexp_f32_e64 v3, v3, 1
+; CI-NEXT:    v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v2, |s6|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s6|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s10|
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -1, v7
+; CI-NEXT:    v_ldexp_f32_e64 v6, v2, 12
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
+; CI-NEXT:    v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v10, v5
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
-; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
-; CI-NEXT:    v_mul_f32_e32 v6, v4, v5
-; CI-NEXT:    v_fma_f32 v7, -v3, v6, v4
-; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
-; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
+; CI-NEXT:    v_fma_f32 v11, -v5, v10, 1.0
+; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
+; CI-NEXT:    v_mul_f32_e32 v11, v9, v10
+; CI-NEXT:    v_fma_f32 v12, -v5, v11, v9
+; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
+; CI-NEXT:    v_fma_f32 v5, -v5, v11, v9
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, s6
-; CI-NEXT:    v_trunc_f32_e32 v3, v3
-; CI-NEXT:    v_fma_f32 v2, -v3, v2, s6
+; CI-NEXT:    v_div_fmas_f32 v5, v5, v10, v11
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v4
+; CI-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB12_22
+; CI-NEXT:  ; %bb.20: ; %frem.loop_body57.preheader
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 12, v7
+; CI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
+; CI-NEXT:  .LBB12_21: ; %frem.loop_body57
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v6
+; CI-NEXT:    v_mul_f32_e32 v6, v7, v5
+; CI-NEXT:    v_rndne_f32_e32 v6, v6
+; CI-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT:    v_add_f32_e32 v8, v6, v3
+; CI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -12, v4
+; CI-NEXT:    v_ldexp_f32_e64 v6, v6, 12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v4
+; CI-NEXT:    s_cbranch_vccnz .LBB12_21
+; CI-NEXT:    s_branch .LBB12_23
+; CI-NEXT:  .LBB12_22:
+; CI-NEXT:    v_mov_b32_e32 v7, v6
+; CI-NEXT:  .LBB12_23: ; %frem.loop_exit58
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -11, v4
+; CI-NEXT:    v_ldexp_f32_e32 v4, v7, v4
+; CI-NEXT:    v_mul_f32_e32 v5, v4, v5
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v3, v4, v3
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v2, v3, v2
+; CI-NEXT:    s_and_b32 s2, s6, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v2, s2, v2
+; CI-NEXT:  .LBB12_24: ; %Flow129
 ; CI-NEXT:    v_mov_b32_e32 v3, s11
-; CI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, s7
-; CI-NEXT:    v_div_scale_f32 v5, vcc, s7, v3, s7
-; CI-NEXT:    v_rcp_f32_e32 v6, v4
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s7|, |v3|
+; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    ; implicit-def: $vgpr3
+; CI-NEXT:    s_cbranch_vccz .LBB12_26
+; CI-NEXT:  ; %bb.25: ; %frem.else84
+; CI-NEXT:    s_and_b32 s2, s7, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v4, s11
+; CI-NEXT:    v_mov_b32_e32 v3, s7
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v4|
+; CI-NEXT:    v_mov_b32_e32 v4, s2
+; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:  .LBB12_26: ; %Flow124
+; CI-NEXT:    s_xor_b32 s2, s2, 1
+; CI-NEXT:    s_and_b32 s2, s2, 1
+; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB12_32
+; CI-NEXT:  ; %bb.27: ; %frem.compute83
+; CI-NEXT:    v_frexp_mant_f32_e64 v4, |s11|
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 1
+; CI-NEXT:    v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v3, |s7|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s7|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v9, |s11|
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -1, v8
+; CI-NEXT:    v_ldexp_f32_e64 v7, v3, 12
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v9
+; CI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
+; CI-NEXT:    v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v11, v6
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v7, -v4, v6, 1.0
-; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
-; CI-NEXT:    v_mul_f32_e32 v7, v5, v6
-; CI-NEXT:    v_fma_f32 v8, -v4, v7, v5
-; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
-; CI-NEXT:    v_fma_f32 v4, -v4, v7, v5
+; CI-NEXT:    v_fma_f32 v12, -v6, v11, 1.0
+; CI-NEXT:    v_fma_f32 v11, v12, v11, v11
+; CI-NEXT:    v_mul_f32_e32 v12, v10, v11
+; CI-NEXT:    v_fma_f32 v13, -v6, v12, v10
+; CI-NEXT:    v_fma_f32 v12, v13, v11, v12
+; CI-NEXT:    v_fma_f32 v6, -v6, v12, v10
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
+; CI-NEXT:    v_div_fmas_f32 v6, v6, v11, v12
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v5
+; CI-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB12_30
+; CI-NEXT:  ; %bb.28: ; %frem.loop_body91.preheader
+; CI-NEXT:    v_add_i32_e32 v5, vcc, 12, v8
+; CI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
+; CI-NEXT:  .LBB12_29: ; %frem.loop_body91
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v8, v7
+; CI-NEXT:    v_mul_f32_e32 v7, v8, v6
+; CI-NEXT:    v_rndne_f32_e32 v7, v7
+; CI-NEXT:    v_fma_f32 v7, -v7, v4, v8
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT:    v_add_f32_e32 v9, v7, v4
+; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -12, v5
+; CI-NEXT:    v_ldexp_f32_e64 v7, v7, 12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v5
+; CI-NEXT:    s_cbranch_vccnz .LBB12_29
+; CI-NEXT:    s_branch .LBB12_31
+; CI-NEXT:  .LBB12_30:
+; CI-NEXT:    v_mov_b32_e32 v8, v7
+; CI-NEXT:  .LBB12_31: ; %frem.loop_exit92
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -11, v5
+; CI-NEXT:    v_ldexp_f32_e32 v5, v8, v5
+; CI-NEXT:    v_mul_f32_e32 v6, v5, v6
+; CI-NEXT:    v_rndne_f32_e32 v6, v6
+; CI-NEXT:    v_fma_f32 v5, -v6, v4, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT:    v_add_f32_e32 v4, v5, v4
+; CI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v3, v4, v3
+; CI-NEXT:    s_and_b32 s2, s7, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v3, s2, v3
+; CI-NEXT:  .LBB12_32: ; %Flow125
+; CI-NEXT:    v_mov_b32_e32 v4, 0x60
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s8, v4
+; CI-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v6, 0x1f8
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s8, 3
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v6
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s9, v4
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s9, 3
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v6
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s10, v4
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s10, 3
+; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s6, v6
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s11, v4
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s11, 3
+; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s7, v6
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, s7
-; CI-NEXT:    v_trunc_f32_e32 v4, v4
-; CI-NEXT:    v_fma_f32 v3, -v4, v3, s7
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -969,71 +3713,338 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x40
+; VI-NEXT:    s_mov_b32 s2, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s4
-; VI-NEXT:    v_div_scale_f32 v2, vcc, s4, v0, s4
-; VI-NEXT:    v_rcp_f32_e32 v3, v1
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
+; VI-NEXT:    ; implicit-def: $vgpr0
+; VI-NEXT:    s_cbranch_vccz .LBB12_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_and_b32 s2, s4, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v1, s8
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:  .LBB12_2: ; %Flow136
+; VI-NEXT:    s_xor_b32 s2, s2, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB12_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s8|
+; VI-NEXT:    v_ldexp_f32 v1, v1, 1
+; VI-NEXT:    v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s4|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s4|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s8|
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT:    v_ldexp_f32 v4, v0, 12
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v8, v3
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
-; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
-; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
-; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT:    v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; VI-NEXT:    v_mul_f32_e32 v9, v7, v8
+; VI-NEXT:    v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; VI-NEXT:    v_fma_f32 v3, -v3, v9, v7
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s4
-; VI-NEXT:    v_trunc_f32_e32 v1, v1
-; VI-NEXT:    v_fma_f32 v0, -v1, v0, s4
+; VI-NEXT:    v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB12_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT:  .LBB12_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v6, v4, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT:    v_ldexp_f32 v4, v4, 12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT:    s_cbranch_vccnz .LBB12_5
+; VI-NEXT:    s_branch .LBB12_7
+; VI-NEXT:  .LBB12_6:
+; VI-NEXT:    v_mov_b32_e32 v5, v4
+; VI-NEXT:  .LBB12_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT:    v_ldexp_f32 v2, v5, v2
+; VI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; VI-NEXT:    v_rndne_f32_e32 v3, v3
+; VI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_add_f32_e32 v1, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_and_b32 s2, s4, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; VI-NEXT:  .LBB12_8: ; %Flow137
 ; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, s5
-; VI-NEXT:    v_div_scale_f32 v3, vcc, s5, v1, s5
-; VI-NEXT:    v_rcp_f32_e32 v4, v2
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s5|, |v1|
+; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    ; implicit-def: $vgpr1
+; VI-NEXT:    s_cbranch_vccz .LBB12_10
+; VI-NEXT:  ; %bb.9: ; %frem.else16
+; VI-NEXT:    s_and_b32 s2, s5, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v2, s9
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v2|
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:  .LBB12_10: ; %Flow132
+; VI-NEXT:    s_xor_b32 s2, s2, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB12_16
+; VI-NEXT:  ; %bb.11: ; %frem.compute15
+; VI-NEXT:    v_frexp_mant_f32_e64 v2, |s9|
+; VI-NEXT:    v_ldexp_f32 v2, v2, 1
+; VI-NEXT:    v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s5|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s5|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s9|
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT:    v_ldexp_f32 v5, v1, 12
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v9, v4
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
-; VI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; VI-NEXT:    v_mul_f32_e32 v5, v3, v4
-; VI-NEXT:    v_fma_f32 v6, -v2, v5, v3
-; VI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; VI-NEXT:    v_fma_f32 v2, -v2, v5, v3
+; VI-NEXT:    v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; VI-NEXT:    v_mul_f32_e32 v10, v8, v9
+; VI-NEXT:    v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; VI-NEXT:    v_fma_f32 v4, -v4, v10, v8
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; VI-NEXT:    v_div_fixup_f32 v2, v2, v1, s5
-; VI-NEXT:    v_trunc_f32_e32 v2, v2
-; VI-NEXT:    v_fma_f32 v1, -v2, v1, s5
+; VI-NEXT:    v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v3
+; VI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB12_14
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 12, v6
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT:  .LBB12_13: ; %frem.loop_body23
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v6, v5
+; VI-NEXT:    v_mul_f32_e32 v5, v6, v4
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT:    v_add_f32_e32 v7, v5, v2
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -12, v3
+; VI-NEXT:    v_ldexp_f32 v5, v5, 12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v3
+; VI-NEXT:    s_cbranch_vccnz .LBB12_13
+; VI-NEXT:    s_branch .LBB12_15
+; VI-NEXT:  .LBB12_14:
+; VI-NEXT:    v_mov_b32_e32 v6, v5
+; VI-NEXT:  .LBB12_15: ; %frem.loop_exit24
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT:    v_ldexp_f32 v3, v6, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT:    v_ldexp_f32 v1, v2, v1
+; VI-NEXT:    s_and_b32 s2, s5, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v1, s2, v1
+; VI-NEXT:  .LBB12_16: ; %Flow133
 ; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, s6
-; VI-NEXT:    v_div_scale_f32 v4, vcc, s6, v2, s6
-; VI-NEXT:    v_rcp_f32_e32 v5, v3
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s6|, |v2|
+; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    ; implicit-def: $vgpr2
+; VI-NEXT:    s_cbranch_vccz .LBB12_18
+; VI-NEXT:  ; %bb.17: ; %frem.else50
+; VI-NEXT:    s_and_b32 s2, s6, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v3, s10
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v3|
+; VI-NEXT:    v_mov_b32_e32 v3, s2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:  .LBB12_18: ; %Flow128
+; VI-NEXT:    s_xor_b32 s2, s2, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB12_24
+; VI-NEXT:  ; %bb.19: ; %frem.compute49
+; VI-NEXT:    v_frexp_mant_f32_e64 v3, |s10|
+; VI-NEXT:    v_ldexp_f32 v3, v3, 1
+; VI-NEXT:    v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v2, |s6|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s6|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s10|
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -1, v7
+; VI-NEXT:    v_ldexp_f32 v6, v2, 12
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v2
+; VI-NEXT:    v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v10, v5
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
-; VI-NEXT:    v_fma_f32 v5, v6, v5, v5
-; VI-NEXT:    v_mul_f32_e32 v6, v4, v5
-; VI-NEXT:    v_fma_f32 v7, -v3, v6, v4
-; VI-NEXT:    v_fma_f32 v6, v7, v5, v6
-; VI-NEXT:    v_fma_f32 v3, -v3, v6, v4
+; VI-NEXT:    v_fma_f32 v11, -v5, v10, 1.0
+; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
+; VI-NEXT:    v_mul_f32_e32 v11, v9, v10
+; VI-NEXT:    v_fma_f32 v12, -v5, v11, v9
+; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
+; VI-NEXT:    v_fma_f32 v5, -v5, v11, v9
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
-; VI-NEXT:    v_div_fixup_f32 v3, v3, v2, s6
-; VI-NEXT:    v_trunc_f32_e32 v3, v3
-; VI-NEXT:    v_fma_f32 v2, -v3, v2, s6
+; VI-NEXT:    v_div_fmas_f32 v5, v5, v10, v11
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v4
+; VI-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB12_22
+; VI-NEXT:  ; %bb.20: ; %frem.loop_body57.preheader
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 12, v7
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v8
+; VI-NEXT:  .LBB12_21: ; %frem.loop_body57
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v6
+; VI-NEXT:    v_mul_f32_e32 v6, v7, v5
+; VI-NEXT:    v_rndne_f32_e32 v6, v6
+; VI-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT:    v_add_f32_e32 v8, v6, v3
+; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -12, v4
+; VI-NEXT:    v_ldexp_f32 v6, v6, 12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v4
+; VI-NEXT:    s_cbranch_vccnz .LBB12_21
+; VI-NEXT:    s_branch .LBB12_23
+; VI-NEXT:  .LBB12_22:
+; VI-NEXT:    v_mov_b32_e32 v7, v6
+; VI-NEXT:  .LBB12_23: ; %frem.loop_exit58
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -11, v4
+; VI-NEXT:    v_ldexp_f32 v4, v7, v4
+; VI-NEXT:    v_mul_f32_e32 v5, v4, v5
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT:    v_ldexp_f32 v2, v3, v2
+; VI-NEXT:    s_and_b32 s2, s6, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v2, s2, v2
+; VI-NEXT:  .LBB12_24: ; %Flow129
 ; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, s7
-; VI-NEXT:    v_div_scale_f32 v5, vcc, s7, v3, s7
-; VI-NEXT:    v_rcp_f32_e32 v6, v4
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s7|, |v3|
+; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    s_cbranch_vccz .LBB12_26
+; VI-NEXT:  ; %bb.25: ; %frem.else84
+; VI-NEXT:    s_and_b32 s2, s7, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v4, s11
+; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v4|
+; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:  .LBB12_26: ; %Flow124
+; VI-NEXT:    s_xor_b32 s2, s2, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB12_32
+; VI-NEXT:  ; %bb.27: ; %frem.compute83
+; VI-NEXT:    v_frexp_mant_f32_e64 v4, |s11|
+; VI-NEXT:    v_ldexp_f32 v4, v4, 1
+; VI-NEXT:    v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v3, |s7|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s7|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v9, |s11|
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -1, v8
+; VI-NEXT:    v_ldexp_f32 v7, v3, 12
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v9
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, v5, v3
+; VI-NEXT:    v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v11, v6
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v7, -v4, v6, 1.0
-; VI-NEXT:    v_fma_f32 v6, v7, v6, v6
-; VI-NEXT:    v_mul_f32_e32 v7, v5, v6
-; VI-NEXT:    v_fma_f32 v8, -v4, v7, v5
-; VI-NEXT:    v_fma_f32 v7, v8, v6, v7
-; VI-NEXT:    v_fma_f32 v4, -v4, v7, v5
+; VI-NEXT:    v_fma_f32 v12, -v6, v11, 1.0
+; VI-NEXT:    v_fma_f32 v11, v12, v11, v11
+; VI-NEXT:    v_mul_f32_e32 v12, v10, v11
+; VI-NEXT:    v_fma_f32 v13, -v6, v12, v10
+; VI-NEXT:    v_fma_f32 v12, v13, v11, v12
+; VI-NEXT:    v_fma_f32 v6, -v6, v12, v10
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
-; VI-NEXT:    v_div_fixup_f32 v4, v4, v3, s7
-; VI-NEXT:    v_trunc_f32_e32 v4, v4
-; VI-NEXT:    v_fma_f32 v3, -v4, v3, s7
+; VI-NEXT:    v_div_fmas_f32 v6, v6, v11, v12
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 12, v5
+; VI-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB12_30
+; VI-NEXT:  ; %bb.28: ; %frem.loop_body91.preheader
+; VI-NEXT:    v_add_u32_e32 v5, vcc, 12, v8
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, v5, v9
+; VI-NEXT:  .LBB12_29: ; %frem.loop_body91
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v8, v7
+; VI-NEXT:    v_mul_f32_e32 v7, v8, v6
+; VI-NEXT:    v_rndne_f32_e32 v7, v7
+; VI-NEXT:    v_fma_f32 v7, -v7, v4, v8
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT:    v_add_f32_e32 v9, v7, v4
+; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -12, v5
+; VI-NEXT:    v_ldexp_f32 v7, v7, 12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v5
+; VI-NEXT:    s_cbranch_vccnz .LBB12_29
+; VI-NEXT:    s_branch .LBB12_31
+; VI-NEXT:  .LBB12_30:
+; VI-NEXT:    v_mov_b32_e32 v8, v7
+; VI-NEXT:  .LBB12_31: ; %frem.loop_exit92
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -11, v5
+; VI-NEXT:    v_ldexp_f32 v5, v8, v5
+; VI-NEXT:    v_mul_f32_e32 v6, v5, v6
+; VI-NEXT:    v_rndne_f32_e32 v6, v6
+; VI-NEXT:    v_fma_f32 v5, -v6, v4, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT:    v_add_f32_e32 v4, v5, v4
+; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT:    v_ldexp_f32 v3, v4, v3
+; VI-NEXT:    s_and_b32 s2, s7, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v3, s2, v3
+; VI-NEXT:  .LBB12_32: ; %Flow125
+; VI-NEXT:    v_mov_b32_e32 v4, 0x60
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s8, v4
+; VI-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; VI-NEXT:    v_mov_b32_e32 v6, 0x1f8
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s8, 3
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v6
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s9, v4
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s9, 3
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v6
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s10, v4
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s10, 3
+; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s6, v6
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s11, v4
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s11, 3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s7, v6
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -1054,39 +4065,204 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; CI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
+; CI-NEXT:    s_mov_b32 s2, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s8
 ; CI-NEXT:    v_mov_b32_e32 v1, s9
-; CI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5]
-; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5]
-; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
-; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; CI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5]
-; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
-; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5]
+; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; CI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT:    s_cbranch_vccz .LBB13_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_mov_b32_e32 v0, s8
+; CI-NEXT:    v_mov_b32_e32 v1, s9
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_brev_b32 s3, 1
+; CI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v2, s4
+; CI-NEXT:    v_mov_b32_e32 v3, s5
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:  .LBB13_2: ; %Flow56
+; CI-NEXT:    s_xor_b32 s2, s2, 1
+; CI-NEXT:    s_and_b32 s2, s2, 1
+; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB13_8
+; CI-NEXT:  ; %bb.3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[8:9]|
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT:    v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT:    v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0
+; CI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
+; CI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT:    v_mul_f64 v[12:13], v[14:15], v[10:11]
+; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; CI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB13_6
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT:    v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT:  .LBB13_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT:    v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT:    v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT:    s_cbranch_vccnz .LBB13_5
+; CI-NEXT:    s_branch .LBB13_7
+; CI-NEXT:  .LBB13_6:
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:  .LBB13_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_brev_b32 s3, 1
+; CI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; CI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; CI-NEXT:    v_xor_b32_e32 v1, s3, v1
+; CI-NEXT:  .LBB13_8: ; %Flow57
 ; CI-NEXT:    v_mov_b32_e32 v2, s10
 ; CI-NEXT:    v_mov_b32_e32 v3, s11
-; CI-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7]
-; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7]
+; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CI-NEXT:    s_cbranch_vccz .LBB13_10
+; CI-NEXT:  ; %bb.9: ; %frem.else16
+; CI-NEXT:    v_mov_b32_e32 v2, s10
+; CI-NEXT:    v_mov_b32_e32 v3, s11
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_brev_b32 s3, 1
+; CI-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    v_mov_b32_e32 v4, s6
+; CI-NEXT:    v_mov_b32_e32 v5, s7
+; CI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:  .LBB13_10: ; %Flow52
+; CI-NEXT:    s_xor_b32 s2, s2, 1
+; CI-NEXT:    s_and_b32 s2, s2, 1
+; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_cbranch_scc1 .LBB13_16
+; CI-NEXT:  ; %bb.11: ; %frem.compute15
+; CI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
+; CI-NEXT:    v_ldexp_f64 v[6:7], v[2:3], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[10:11]|
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -1, v8
+; CI-NEXT:    v_add_i32_e32 v10, vcc, -1, v9
+; CI-NEXT:    v_sub_i32_e32 v11, vcc, v4, v10
+; CI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 1
+; CI-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0
+; CI-NEXT:    v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0
+; CI-NEXT:    v_rcp_f64_e32 v[12:13], v[4:5]
+; CI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; CI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; CI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; CI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; CI-NEXT:    v_mul_f64 v[14:15], v[16:17], v[12:13]
+; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15]
+; CI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v11
+; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB13_14
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT:    v_add_i32_e32 v8, vcc, 26, v8
+; CI-NEXT:    v_sub_i32_e32 v11, vcc, v8, v9
+; CI-NEXT:  .LBB13_13: ; %frem.loop_body23
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v9, v7
+; CI-NEXT:    v_mov_b32_e32 v8, v6
+; CI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
+; CI-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT:    v_add_f64 v[12:13], v[6:7], v[2:3]
+; CI-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v13, vcc
+; CI-NEXT:    v_ldexp_f64 v[6:7], v[6:7], 26
+; CI-NEXT:    v_add_i32_e32 v11, vcc, 0xffffffe6, v11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v11
+; CI-NEXT:    s_cbranch_vccnz .LBB13_13
+; CI-NEXT:    s_branch .LBB13_15
+; CI-NEXT:  .LBB13_14:
+; CI-NEXT:    v_mov_b32_e32 v9, v7
+; CI-NEXT:    v_mov_b32_e32 v8, v6
+; CI-NEXT:  .LBB13_15: ; %frem.loop_exit24
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffe7, v11
+; CI-NEXT:    v_ldexp_f64 v[6:7], v[8:9], v6
+; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_brev_b32 s3, 1
+; CI-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
+; CI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[4:5]
+; CI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; CI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
+; CI-NEXT:    v_xor_b32_e32 v2, s2, v2
+; CI-NEXT:    v_xor_b32_e32 v3, s3, v3
+; CI-NEXT:  .LBB13_16: ; %Flow53
+; CI-NEXT:    v_mov_b32_e32 v4, 0x60
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[8:9], v4
+; CI-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; CI-NEXT:    v_mov_b32_e32 v6, 0x1f8
+; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[8:9], 3
+; CI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v6
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[10:11], v4
+; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[10:11], 3
+; CI-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[6:7], v6
+; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
-; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
-; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7]
-; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; CI-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7]
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -1097,38 +4273,203 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x40
+; VI-NEXT:    s_mov_b32 s2, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
 ; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5]
-; VI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5]
-; VI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
-; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5]
-; VI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5]
+; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT:    s_cbranch_vccz .LBB13_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_brev_b32 s3, 1
+; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:  .LBB13_2: ; %Flow56
+; VI-NEXT:    s_xor_b32 s2, s2, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB13_8
+; VI-NEXT:  ; %bb.3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[8:9]|
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT:    v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0
+; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT:    v_mul_f64 v[12:13], v[14:15], v[10:11]
+; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB13_6
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT:  .LBB13_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT:    v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT:    v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT:    s_cbranch_vccnz .LBB13_5
+; VI-NEXT:    s_branch .LBB13_7
+; VI-NEXT:  .LBB13_6:
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:  .LBB13_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_brev_b32 s3, 1
+; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; VI-NEXT:    v_xor_b32_e32 v1, s3, v1
+; VI-NEXT:  .LBB13_8: ; %Flow57
 ; VI-NEXT:    v_mov_b32_e32 v2, s10
 ; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7]
-; VI-NEXT:    v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7]
-; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; VI-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
-; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
-; VI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; VI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7]
-; VI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; VI-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7]
+; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; VI-NEXT:    s_cbranch_vccz .LBB13_10
+; VI-NEXT:  ; %bb.9: ; %frem.else16
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_brev_b32 s3, 1
+; VI-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v4, s6
+; VI-NEXT:    v_mov_b32_e32 v5, s7
+; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:  .LBB13_10: ; %Flow52
+; VI-NEXT:    s_xor_b32 s2, s2, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_cbranch_scc1 .LBB13_16
+; VI-NEXT:  ; %bb.11: ; %frem.compute15
+; VI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
+; VI-NEXT:    v_ldexp_f64 v[6:7], v[2:3], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[10:11]|
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -1, v8
+; VI-NEXT:    v_add_u32_e32 v10, vcc, -1, v9
+; VI-NEXT:    v_sub_u32_e32 v11, vcc, v4, v10
+; VI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 1
+; VI-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0
+; VI-NEXT:    v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0
+; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[4:5]
+; VI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; VI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; VI-NEXT:    v_mul_f64 v[14:15], v[16:17], v[12:13]
+; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; VI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15]
+; VI-NEXT:    v_cmp_ge_i32_e32 vcc, 26, v11
+; VI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB13_14
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT:    v_add_u32_e32 v8, vcc, 26, v8
+; VI-NEXT:    v_sub_u32_e32 v11, vcc, v8, v9
+; VI-NEXT:  .LBB13_13: ; %frem.loop_body23
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v9, v7
+; VI-NEXT:    v_mov_b32_e32 v8, v6
+; VI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
+; VI-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT:    v_add_f64 v[12:13], v[6:7], v[2:3]
+; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v13, vcc
+; VI-NEXT:    v_ldexp_f64 v[6:7], v[6:7], 26
+; VI-NEXT:    v_add_u32_e32 v11, vcc, 0xffffffe6, v11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v11
+; VI-NEXT:    s_cbranch_vccnz .LBB13_13
+; VI-NEXT:    s_branch .LBB13_15
+; VI-NEXT:  .LBB13_14:
+; VI-NEXT:    v_mov_b32_e32 v9, v7
+; VI-NEXT:    v_mov_b32_e32 v8, v6
+; VI-NEXT:  .LBB13_15: ; %frem.loop_exit24
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 0xffffffe7, v11
+; VI-NEXT:    v_ldexp_f64 v[6:7], v[8:9], v6
+; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_brev_b32 s3, 1
+; VI-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
+; VI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[4:5]
+; VI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
+; VI-NEXT:    v_xor_b32_e32 v2, s2, v2
+; VI-NEXT:    v_xor_b32_e32 v3, s3, v3
+; VI-NEXT:  .LBB13_16: ; %Flow53
+; VI-NEXT:    v_mov_b32_e32 v4, 0x60
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[8:9], v4
+; VI-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; VI-NEXT:    v_mov_b32_e32 v6, 0x1f8
+; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[8:9], 3
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v6
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[10:11], v4
+; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[10:11], 3
+; VI-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[6:7], v6
+; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 4962254545c3c..0d1ed8068b9b5 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -2570,21 +2570,86 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1032:       ; %bb.0: ; %entry
 ; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x28
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_div_scale_f32 v1, s1, s0, s0, v0
-; GFX1032-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, s0, v0
+; GFX1032-NEXT:    v_cmp_ngt_f32_e64 s1, v0, |s0|
+; GFX1032-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX1032-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX1032-NEXT:  ; %bb.1: ; %frem.else
+; GFX1032-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 vcc_lo, v0, |s0|
+; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX1032-NEXT:  ; %bb.2: ; %Flow13
+; GFX1032-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX1032-NEXT:    s_cbranch_execz .LBB51_8
+; GFX1032-NEXT:  ; %bb.3: ; %frem.compute
+; GFX1032-NEXT:    v_frexp_mant_f32_e64 v1, |s0|
+; GFX1032-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; GFX1032-NEXT:    v_frexp_mant_f32_e32 v8, v0
+; GFX1032-NEXT:    v_ldexp_f32 v1, v1, 1
+; GFX1032-NEXT:    v_div_scale_f32 v2, s2, v1, v1, 1.0
+; GFX1032-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0
+; GFX1032-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1032-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX1032-NEXT:    v_fmac_f32_e32 v3, v4, v3
+; GFX1032-NEXT:    v_mul_f32_e32 v4, v5, v3
+; GFX1032-NEXT:    v_fma_f32 v6, -v2, v4, v5
+; GFX1032-NEXT:    v_fmac_f32_e32 v4, v6, v3
+; GFX1032-NEXT:    v_frexp_exp_i32_f32_e32 v6, s0
+; GFX1032-NEXT:    v_fma_f32 v5, -v2, v4, v5
+; GFX1032-NEXT:    v_add_nc_u32_e32 v2, -1, v6
+; GFX1032-NEXT:    v_div_fmas_f32 v3, v5, v3, v4
+; GFX1032-NEXT:    v_xad_u32 v4, v2, -1, v7
+; GFX1032-NEXT:    v_ldexp_f32 v5, v8, 12
+; GFX1032-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 12, v4
+; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB51_7
+; GFX1032-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1032-NEXT:    v_sub_nc_u32_e32 v4, v7, v6
+; GFX1032-NEXT:    s_mov_b32 s3, 0
+; GFX1032-NEXT:    v_add_nc_u32_e32 v4, 12, v4
+; GFX1032-NEXT:  .LBB51_5: ; %frem.loop_body
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_mov_b32_e32 v6, v5
+; GFX1032-NEXT:    v_add_nc_u32_e32 v4, -12, v4
+; GFX1032-NEXT:    v_mul_f32_e32 v5, v6, v3
+; GFX1032-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1032-NEXT:    v_fma_f32 v5, -v5, v1, v6
+; GFX1032-NEXT:    v_add_f32_e32 v7, v5, v1
+; GFX1032-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1032-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX1032-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v4
+; GFX1032-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX1032-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execnz .LBB51_5
+; GFX1032-NEXT:  ; %bb.6: ; %Flow
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v5, v6
+; GFX1032-NEXT:  .LBB51_7: ; %Flow12
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    v_add_nc_u32_e32 v4, -11, v4
+; GFX1032-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX1032-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX1032-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX1032-NEXT:    v_fma_f32 v3, -v3, v1, v4
+; GFX1032-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX1032-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX1032-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX1032-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
+; GFX1032-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1032-NEXT:  .LBB51_8: ; %Flow14
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT:    v_cmp_class_f32_e64 s1, s0, 3
+; GFX1032-NEXT:    v_cmp_class_f32_e64 s0, s0, 0x60
+; GFX1032-NEXT:    v_cmp_class_f32_e64 s2, v0, 0x1f8
+; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7fc00000, s0
+; GFX1032-NEXT:    s_xor_b32 s0, s1, -1
 ; GFX1032-NEXT:    s_brev_b32 s1, 1
-; GFX1032-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX1032-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX1032-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX1032-NEXT:    v_mul_f32_e32 v3, v4, v2
-; GFX1032-NEXT:    v_fma_f32 v5, -v1, v3, v4
-; GFX1032-NEXT:    v_fmac_f32_e32 v3, v5, v2
-; GFX1032-NEXT:    v_fma_f32 v1, -v1, v3, v4
-; GFX1032-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
-; GFX1032-NEXT:    v_div_fixup_f32 v1, v1, s0, v0
-; GFX1032-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX1032-NEXT:    v_fma_f32 v0, -v1, s0, v0
+; GFX1032-NEXT:    s_and_b32 vcc_lo, s0, s2
+; GFX1032-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
 ; GFX1032-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_lshr_b32 s0, vcc_lo, 1
 ; GFX1032-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, 0, v0
@@ -2593,29 +2658,94 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1032-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, s0
-; GFX1032-NEXT:  ; %bb.1: ; %if.then
+; GFX1032-NEXT:  ; %bb.9: ; %if.then
 ; GFX1032-NEXT:    ; divergent unreachable
-; GFX1032-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX1032-NEXT:  ; %bb.10: ; %UnifiedReturnBlock
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: fcmp64:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x28
+; GFX1064-NEXT:    s_load_dword s6, s[4:5], 0x28
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_div_scale_f32 v1, s[0:1], s2, s2, v0
-; GFX1064-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX1064-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX1064-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX1064-NEXT:    v_div_scale_f32 v3, vcc, v0, s2, v0
-; GFX1064-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX1064-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX1064-NEXT:    v_fmac_f32_e32 v4, v5, v2
-; GFX1064-NEXT:    v_fma_f32 v1, -v1, v4, v3
-; GFX1064-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX1064-NEXT:    v_div_fixup_f32 v1, v1, s2, v0
-; GFX1064-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX1064-NEXT:    v_fma_f32 v0, -v1, s2, v0
+; GFX1064-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, |s6|
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[2:3]
+; GFX1064-NEXT:  ; %bb.1: ; %frem.else
+; GFX1064-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 vcc, v0, |s6|
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; GFX1064-NEXT:  ; %bb.2: ; %Flow13
+; GFX1064-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB51_8
+; GFX1064-NEXT:  ; %bb.3: ; %frem.compute
+; GFX1064-NEXT:    v_frexp_mant_f32_e64 v1, |s6|
+; GFX1064-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; GFX1064-NEXT:    v_frexp_mant_f32_e32 v8, v0
+; GFX1064-NEXT:    v_ldexp_f32 v1, v1, 1
+; GFX1064-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, 1.0
+; GFX1064-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v1, 1.0
+; GFX1064-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1064-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX1064-NEXT:    v_fmac_f32_e32 v3, v4, v3
+; GFX1064-NEXT:    v_mul_f32_e32 v4, v5, v3
+; GFX1064-NEXT:    v_fma_f32 v6, -v2, v4, v5
+; GFX1064-NEXT:    v_fmac_f32_e32 v4, v6, v3
+; GFX1064-NEXT:    v_frexp_exp_i32_f32_e32 v6, s6
+; GFX1064-NEXT:    v_fma_f32 v5, -v2, v4, v5
+; GFX1064-NEXT:    v_add_nc_u32_e32 v2, -1, v6
+; GFX1064-NEXT:    v_div_fmas_f32 v3, v5, v3, v4
+; GFX1064-NEXT:    v_xad_u32 v4, v2, -1, v7
+; GFX1064-NEXT:    v_ldexp_f32 v5, v8, 12
+; GFX1064-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v4
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB51_7
+; GFX1064-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1064-NEXT:    v_sub_nc_u32_e32 v4, v7, v6
+; GFX1064-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1064-NEXT:    v_add_nc_u32_e32 v4, 12, v4
+; GFX1064-NEXT:  .LBB51_5: ; %frem.loop_body
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_mov_b32_e32 v6, v5
+; GFX1064-NEXT:    v_add_nc_u32_e32 v4, -12, v4
+; GFX1064-NEXT:    v_mul_f32_e32 v5, v6, v3
+; GFX1064-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1064-NEXT:    v_fma_f32 v5, -v5, v1, v6
+; GFX1064-NEXT:    v_add_f32_e32 v7, v5, v1
+; GFX1064-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; GFX1064-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX1064-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v4
+; GFX1064-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX1064-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB51_5
+; GFX1064-NEXT:  ; %bb.6: ; %Flow
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    v_mov_b32_e32 v5, v6
+; GFX1064-NEXT:  .LBB51_7: ; %Flow12
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    v_add_nc_u32_e32 v4, -11, v4
+; GFX1064-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX1064-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX1064-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX1064-NEXT:    v_fma_f32 v3, -v3, v1, v4
+; GFX1064-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX1064-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX1064-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX1064-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
+; GFX1064-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1064-NEXT:  .LBB51_8: ; %Flow14
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    v_cmp_class_f32_e64 s[4:5], s6, 0x60
+; GFX1064-NEXT:    v_cmp_class_f32_e64 s[0:1], s6, 3
+; GFX1064-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, 0x1f8
+; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7fc00000, s[4:5]
+; GFX1064-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX1064-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
+; GFX1064-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc
 ; GFX1064-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_lshr_b64 s[0:1], vcc, 1
 ; GFX1064-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v0
@@ -2625,9 +2755,9 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX1064-NEXT:  ; %bb.1: ; %if.then
+; GFX1064-NEXT:  ; %bb.9: ; %if.then
 ; GFX1064-NEXT:    ; divergent unreachable
-; GFX1064-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX1064-NEXT:  ; %bb.10: ; %UnifiedReturnBlock
 ; GFX1064-NEXT:    s_endpgm
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -2756,20 +2886,85 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1032:       ; %bb.0: ; %entry
 ; GFX1032-NEXT:    s_load_dword s0, s[4:5], 0x28
 ; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_div_scale_f32 v1, s1, s0, s0, v0
-; GFX1032-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX1032-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX1032-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX1032-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, s0, v0
-; GFX1032-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX1032-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX1032-NEXT:    v_fmac_f32_e32 v4, v5, v2
-; GFX1032-NEXT:    v_fma_f32 v1, -v1, v4, v3
-; GFX1032-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX1032-NEXT:    v_div_fixup_f32 v1, v1, s0, v0
-; GFX1032-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX1032-NEXT:    v_fma_f32 v0, -v1, s0, v0
+; GFX1032-NEXT:    v_cmp_ngt_f32_e64 s1, v0, |s0|
+; GFX1032-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX1032-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX1032-NEXT:  ; %bb.1: ; %frem.else
+; GFX1032-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1032-NEXT:    v_cmp_eq_f32_e64 vcc_lo, v0, |s0|
+; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX1032-NEXT:  ; %bb.2: ; %Flow13
+; GFX1032-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX1032-NEXT:    s_cbranch_execz .LBB53_8
+; GFX1032-NEXT:  ; %bb.3: ; %frem.compute
+; GFX1032-NEXT:    v_frexp_mant_f32_e64 v1, |s0|
+; GFX1032-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; GFX1032-NEXT:    v_frexp_mant_f32_e32 v8, v0
+; GFX1032-NEXT:    v_ldexp_f32 v1, v1, 1
+; GFX1032-NEXT:    v_div_scale_f32 v2, s2, v1, v1, 1.0
+; GFX1032-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0
+; GFX1032-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1032-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX1032-NEXT:    v_fmac_f32_e32 v3, v4, v3
+; GFX1032-NEXT:    v_mul_f32_e32 v4, v5, v3
+; GFX1032-NEXT:    v_fma_f32 v6, -v2, v4, v5
+; GFX1032-NEXT:    v_fmac_f32_e32 v4, v6, v3
+; GFX1032-NEXT:    v_frexp_exp_i32_f32_e32 v6, s0
+; GFX1032-NEXT:    v_fma_f32 v5, -v2, v4, v5
+; GFX1032-NEXT:    v_add_nc_u32_e32 v2, -1, v6
+; GFX1032-NEXT:    v_div_fmas_f32 v3, v5, v3, v4
+; GFX1032-NEXT:    v_xad_u32 v4, v2, -1, v7
+; GFX1032-NEXT:    v_ldexp_f32 v5, v8, 12
+; GFX1032-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 12, v4
+; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT:    s_cbranch_execz .LBB53_7
+; GFX1032-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1032-NEXT:    v_sub_nc_u32_e32 v4, v7, v6
+; GFX1032-NEXT:    s_mov_b32 s3, 0
+; GFX1032-NEXT:    v_add_nc_u32_e32 v4, 12, v4
+; GFX1032-NEXT:  .LBB53_5: ; %frem.loop_body
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    v_mov_b32_e32 v6, v5
+; GFX1032-NEXT:    v_add_nc_u32_e32 v4, -12, v4
+; GFX1032-NEXT:    v_mul_f32_e32 v5, v6, v3
+; GFX1032-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1032-NEXT:    v_fma_f32 v5, -v5, v1, v6
+; GFX1032-NEXT:    v_add_f32_e32 v7, v5, v1
+; GFX1032-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1032-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX1032-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v4
+; GFX1032-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX1032-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execnz .LBB53_5
+; GFX1032-NEXT:  ; %bb.6: ; %Flow
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v5, v6
+; GFX1032-NEXT:  .LBB53_7: ; %Flow12
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT:    v_add_nc_u32_e32 v4, -11, v4
+; GFX1032-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX1032-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX1032-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX1032-NEXT:    v_fma_f32 v3, -v3, v1, v4
+; GFX1032-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX1032-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX1032-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX1032-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
+; GFX1032-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1032-NEXT:  .LBB53_8: ; %Flow14
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT:    v_cmp_class_f32_e64 s1, s0, 3
+; GFX1032-NEXT:    v_cmp_class_f32_e64 s0, s0, 0x60
+; GFX1032-NEXT:    v_cmp_class_f32_e64 s2, v0, 0x1f8
+; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7fc00000, s0
+; GFX1032-NEXT:    s_xor_b32 s0, s1, -1
+; GFX1032-NEXT:    s_and_b32 vcc_lo, s0, s2
+; GFX1032-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
 ; GFX1032-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_lshr_b32 s0, vcc_lo, 1
 ; GFX1032-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, 0, v0
@@ -2779,29 +2974,94 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1032-NEXT:    s_cselect_b32 s0, -1, 0
 ; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, s0
-; GFX1032-NEXT:  ; %bb.1: ; %if.then
+; GFX1032-NEXT:  ; %bb.9: ; %if.then
 ; GFX1032-NEXT:    ; divergent unreachable
-; GFX1032-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX1032-NEXT:  ; %bb.10: ; %UnifiedReturnBlock
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: fcmp32:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x28
+; GFX1064-NEXT:    s_load_dword s6, s[4:5], 0x28
 ; GFX1064-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_div_scale_f32 v1, s[0:1], s2, s2, v0
-; GFX1064-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX1064-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
-; GFX1064-NEXT:    v_fmac_f32_e32 v2, v3, v2
-; GFX1064-NEXT:    v_div_scale_f32 v3, vcc, v0, s2, v0
-; GFX1064-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX1064-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX1064-NEXT:    v_fmac_f32_e32 v4, v5, v2
-; GFX1064-NEXT:    v_fma_f32 v1, -v1, v4, v3
-; GFX1064-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX1064-NEXT:    v_div_fixup_f32 v1, v1, s2, v0
-; GFX1064-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX1064-NEXT:    v_fma_f32 v0, -v1, s2, v0
+; GFX1064-NEXT:    v_cmp_ngt_f32_e64 s[0:1], v0, |s6|
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[2:3]
+; GFX1064-NEXT:  ; %bb.1: ; %frem.else
+; GFX1064-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1064-NEXT:    v_cmp_eq_f32_e64 vcc, v0, |s6|
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; GFX1064-NEXT:  ; %bb.2: ; %Flow13
+; GFX1064-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB53_8
+; GFX1064-NEXT:  ; %bb.3: ; %frem.compute
+; GFX1064-NEXT:    v_frexp_mant_f32_e64 v1, |s6|
+; GFX1064-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; GFX1064-NEXT:    v_frexp_mant_f32_e32 v8, v0
+; GFX1064-NEXT:    v_ldexp_f32 v1, v1, 1
+; GFX1064-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, 1.0
+; GFX1064-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v1, 1.0
+; GFX1064-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1064-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; GFX1064-NEXT:    v_fmac_f32_e32 v3, v4, v3
+; GFX1064-NEXT:    v_mul_f32_e32 v4, v5, v3
+; GFX1064-NEXT:    v_fma_f32 v6, -v2, v4, v5
+; GFX1064-NEXT:    v_fmac_f32_e32 v4, v6, v3
+; GFX1064-NEXT:    v_frexp_exp_i32_f32_e32 v6, s6
+; GFX1064-NEXT:    v_fma_f32 v5, -v2, v4, v5
+; GFX1064-NEXT:    v_add_nc_u32_e32 v2, -1, v6
+; GFX1064-NEXT:    v_div_fmas_f32 v3, v5, v3, v4
+; GFX1064-NEXT:    v_xad_u32 v4, v2, -1, v7
+; GFX1064-NEXT:    v_ldexp_f32 v5, v8, 12
+; GFX1064-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v4
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_cbranch_execz .LBB53_7
+; GFX1064-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1064-NEXT:    v_sub_nc_u32_e32 v4, v7, v6
+; GFX1064-NEXT:    s_mov_b64 s[4:5], 0
+; GFX1064-NEXT:    v_add_nc_u32_e32 v4, 12, v4
+; GFX1064-NEXT:  .LBB53_5: ; %frem.loop_body
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    v_mov_b32_e32 v6, v5
+; GFX1064-NEXT:    v_add_nc_u32_e32 v4, -12, v4
+; GFX1064-NEXT:    v_mul_f32_e32 v5, v6, v3
+; GFX1064-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1064-NEXT:    v_fma_f32 v5, -v5, v1, v6
+; GFX1064-NEXT:    v_add_f32_e32 v7, v5, v1
+; GFX1064-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; GFX1064-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX1064-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v4
+; GFX1064-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX1064-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_cbranch_execnz .LBB53_5
+; GFX1064-NEXT:  ; %bb.6: ; %Flow
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    v_mov_b32_e32 v5, v6
+; GFX1064-NEXT:  .LBB53_7: ; %Flow12
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT:    v_add_nc_u32_e32 v4, -11, v4
+; GFX1064-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX1064-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX1064-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX1064-NEXT:    v_fma_f32 v3, -v3, v1, v4
+; GFX1064-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX1064-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX1064-NEXT:    v_ldexp_f32 v1, v1, v2
+; GFX1064-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
+; GFX1064-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1064-NEXT:  .LBB53_8: ; %Flow14
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    v_cmp_class_f32_e64 s[4:5], s6, 0x60
+; GFX1064-NEXT:    v_cmp_class_f32_e64 s[0:1], s6, 3
+; GFX1064-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, 0x1f8
+; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7fc00000, s[4:5]
+; GFX1064-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX1064-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
+; GFX1064-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc
 ; GFX1064-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_lshr_b32 s0, vcc_lo, 1
 ; GFX1064-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v0
@@ -2811,9 +3071,9 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1064-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX1064-NEXT:  ; %bb.1: ; %if.then
+; GFX1064-NEXT:  ; %bb.9: ; %if.then
 ; GFX1064-NEXT:    ; divergent unreachable
-; GFX1064-NEXT:  ; %bb.2: ; %UnifiedReturnBlock
+; GFX1064-NEXT:  ; %bb.10: ; %UnifiedReturnBlock
 ; GFX1064-NEXT:    s_endpgm
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()

>From 193957c97f75009fe5801f0291f67ea592785c64 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 12 Mar 2025 12:35:42 -0400
Subject: [PATCH 02/19] Adjust some comments, remove include

---
 llvm/lib/CodeGen/ExpandLargeFpConvert.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index 31d3779eb7c9f..0bef9abc0eac3 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -22,7 +22,6 @@
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/IR/FMF.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/PassManager.h"
@@ -138,9 +137,8 @@ class FRemExpander {
                                                const Twine &PowName) const {
     // Build:
     //   ExName = BUILTIN_FREXP_EXP_ComputeFpTy(Src) - 1;
-    //   PowName =
-    //   BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ExName),
-    //   NewExp);
+    //   PowName = BUILTIN_FLDEXP_ComputeFpTy(
+    //             BUILTIN_FREXP_MANT_ComputeFpTy(ExName), NewExp);
     Type *Ty = Src->getType();
     Type *ExTy = B.getInt32Ty();
     Value *Frexp = B.CreateIntrinsic(Intrinsic::frexp, {Ty, ExTy}, Src);
@@ -161,10 +159,11 @@ class FRemExpander {
                                  PHINode *RetPhi) const {
     // Build:
     // ex = BUILTIN_FREXP_EXP_ComputeFpTy(ax) - 1;
-    // ax = BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ax),
-    // bits); ey = BUILTIN_FREXP_EXP_ComputeFpTy(ay) - 1; ay =
-    // BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ay), 1); auto
-    // [Ax, Ex]{getFrexpResults(B, AxInitial)};
+    // ax = BUILTIN_FLDEXP_ComputeFpTy(
+    //      BUILTIN_FREXP_MANT_ComputeFpTy(ax), bits);
+    // ey = BUILTIN_FREXP_EXP_ComputeFpTy(ay) - 1;
+    // ay = BUILTIN_FLDEXP_ComputeFpTy(
+    //      BUILTIN_FREXP_MANT_ComputeFpTy(ay), 1);
     auto [Ax, Ex] = buildExpAndPower(AxInitial, Bits, "ex", "ax");
     auto [Ay, Ey] = buildExpAndPower(AyInitial, One, "ey", "ay");
 
@@ -218,7 +217,7 @@ class FRemExpander {
         AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), "ax");
     AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv);
 
-    // Adjust exponent and sign
+    // Build:
     //    ax = BUILTIN_FLDEXP_ComputeFpTy(ax, ey);
     //    ret = AS_FLOAT((AS_INT(x) & SIGNBIT_SP32) ^ AS_INT(ax));
     AxFinal = createLdexp(AxFinal, Ey, "ax");
@@ -257,7 +256,8 @@ class FRemExpander {
     // Build:
     //   ret = y == 0.0f ? QNAN_ComputeFpTy : ret;
     //   bool c = !BUILTIN_ISNAN_ComputeFpTy(y) &&
-    //   BUILTIN_ISFINITE_ComputeFpTy(x); ret = c ? ret : QNAN_ComputeFpTy;
+    //   BUILTIN_ISFINITE_ComputeFpTy(x);
+    //   ret = c ? ret : QNAN_ComputeFpTy;
     // TODO Handle NaN and infinity fast math flags separately here?
     Value *Nan = ConstantFP::getQNaN(FremTy);
 

>From 58a2edb4bae94e19b476802a56a02c051abd2a62 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 12 Mar 2025 12:38:28 -0400
Subject: [PATCH 03/19] clang-format changes

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index b2b136c984bf4..ab955e7eee2a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -388,7 +388,6 @@ class AMDGPUTargetLowering : public TargetLowering {
     return MVT::i32;
   }
   bool shouldExpandFRemInIR() const override { return true; };
-
 };
 
 namespace AMDGPUISD {

>From f3f0a8f41da3ad9d3ad3ff1e1b5abde50a69967c Mon Sep 17 00:00:00 2001
From: Frederik Harwath <frederik at harwath.name>
Date: Thu, 13 Mar 2025 10:53:52 +0100
Subject: [PATCH 04/19] Apply suggestions from code review

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
 llvm/lib/CodeGen/ExpandLargeFpConvert.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index 0bef9abc0eac3..fa9bfab03bdf7 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -387,7 +387,7 @@ static bool expandFRem(BinaryOperator &I) {
   if (ReturnTy->isFloatingPointTy())
     Ret = Expander->buildFRem(I.getOperand(0), I.getOperand(1));
   else {
-    auto VecTy = cast<FixedVectorType>(ReturnTy);
+    auto *VecTy = cast<FixedVectorType>(ReturnTy);
 
     // This could use SplitBlockAndInsertForEachLane but the interface
     // is a bit awkward for a constant number of elements and it will
@@ -406,8 +406,7 @@ static bool expandFRem(BinaryOperator &I) {
 
   I.replaceAllUsesWith(Ret);
   Ret->takeName(&I);
-  I.removeFromParent();
-  I.dropAllReferences();
+  I.eraseFromParent();
 
   return true;
 }
@@ -1036,7 +1035,7 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
   while (!Replace.empty()) {
     Instruction *I = Replace.pop_back_val();
     if (I->getOpcode() == Instruction::FRem)
-      expandFRem(llvm::cast<BinaryOperator>(*I));
+      expandFRem(cast<BinaryOperator>(*I));
     else if (I->getOpcode() == Instruction::FPToUI ||
              I->getOpcode() == Instruction::FPToSI) {
       expandFPToI(I);

>From 9eaad61169a55142cdd6fda3f78ddb2ddf0b7cd3 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 13 Mar 2025 07:32:39 -0400
Subject: [PATCH 05/19] Rename ExpandLargeFpConvertPass to ExpandFpPass

This is meant as a preparation for PR #130988 "[AMDGPU] Implement IR expansion for frem instruction" which implements the expansion of another instruction in this pass. The more general name seems more appropriate given this change and quite reasonable even without it.

The renaming of the source files happens in the following commit for a
better diff.

* llvm/lib/CodeGen/ExpandLargeFpConvert.cpp:
---
 llvm/docs/WritingAnLLVMPass.rst               |  2 +-
 .../llvm/CodeGen/ExpandLargeFpConvert.h       | 14 +++----
 llvm/include/llvm/CodeGen/Passes.h            |  2 +-
 llvm/include/llvm/CodeGen/TargetLowering.h    | 13 ++++---
 llvm/include/llvm/InitializePasses.h          |  2 +-
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  4 +-
 .../llvm/Passes/MachinePassRegistry.def       |  2 +-
 llvm/lib/CodeGen/CMakeLists.txt               |  2 +-
 llvm/lib/CodeGen/CodeGen.cpp                  |  2 +-
 llvm/lib/CodeGen/ExpandLargeFpConvert.cpp     | 39 +++++++++----------
 llvm/lib/CodeGen/TargetPassConfig.cpp         |  2 +-
 llvm/lib/Passes/PassBuilder.cpp               |  2 +-
 llvm/lib/Passes/PassRegistry.def              |  2 +-
 llvm/test/CodeGen/AArch64/O0-pipeline.ll      |  2 +-
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |  2 +-
 llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll     |  2 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      | 10 ++---
 llvm/test/CodeGen/ARM/O3-pipeline.ll          |  2 +-
 llvm/test/CodeGen/LoongArch/O0-pipeline.ll    |  2 +-
 llvm/test/CodeGen/LoongArch/opt-pipeline.ll   |  2 +-
 llvm/test/CodeGen/M68k/pipeline.ll            |  2 +-
 llvm/test/CodeGen/PowerPC/O0-pipeline.ll      |  2 +-
 llvm/test/CodeGen/PowerPC/O3-pipeline.ll      |  2 +-
 llvm/test/CodeGen/RISCV/O0-pipeline.ll        |  2 +-
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |  2 +-
 llvm/test/CodeGen/X86/O0-pipeline.ll          |  2 +-
 llvm/test/CodeGen/X86/opt-pipeline.ll         |  2 +-
 .../X86/expand-large-fp-convert-fptosi129.ll  |  4 +-
 .../X86/expand-large-fp-convert-fptoui129.ll  |  4 +-
 .../X86/expand-large-fp-convert-si129tofp.ll  |  4 +-
 .../X86/expand-large-fp-convert-ui129tofp.ll  |  4 +-
 llvm/tools/opt/optdriver.cpp                  |  4 +-
 .../gn/secondary/llvm/lib/CodeGen/BUILD.gn    |  2 +-
 33 files changed, 73 insertions(+), 73 deletions(-)

diff --git a/llvm/docs/WritingAnLLVMPass.rst b/llvm/docs/WritingAnLLVMPass.rst
index 31194e8b0389c..484227bac38b5 100644
--- a/llvm/docs/WritingAnLLVMPass.rst
+++ b/llvm/docs/WritingAnLLVMPass.rst
@@ -652,7 +652,7 @@ default optimization pipelines, e.g. (the output has been trimmed):
   Pre-ISel Intrinsic Lowering
   FunctionPass Manager
     Expand large div/rem
-    Expand large fp convert
+    Expand fp
     Expand Atomic instructions
   SVE intrinsics optimizations
     FunctionPass Manager
diff --git a/llvm/include/llvm/CodeGen/ExpandLargeFpConvert.h b/llvm/include/llvm/CodeGen/ExpandLargeFpConvert.h
index 72e31f04209dd..c7213981d5926 100644
--- a/llvm/include/llvm/CodeGen/ExpandLargeFpConvert.h
+++ b/llvm/include/llvm/CodeGen/ExpandLargeFpConvert.h
@@ -1,4 +1,4 @@
-//===- ExpandLargeFpConvert.h -----------------------------------*- C++ -*-===//
+//===- ExpandFp.h -----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_EXPANDLARGEFPCONVERT_H
-#define LLVM_CODEGEN_EXPANDLARGEFPCONVERT_H
+#ifndef LLVM_CODEGEN_EXPANDFP_H
+#define LLVM_CODEGEN_EXPANDFP_H
 
 #include "llvm/IR/PassManager.h"
 
@@ -15,17 +15,17 @@ namespace llvm {
 
 class TargetMachine;
 
-class ExpandLargeFpConvertPass
-    : public PassInfoMixin<ExpandLargeFpConvertPass> {
+class ExpandFpPass
+    : public PassInfoMixin<ExpandFpPass> {
 private:
   const TargetMachine *TM;
 
 public:
-  explicit ExpandLargeFpConvertPass(const TargetMachine *TM_) : TM(TM_) {}
+  explicit ExpandFpPass(const TargetMachine *TM_) : TM(TM_) {}
 
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
 } // end namespace llvm
 
-#endif // LLVM_CODEGEN_EXPANDLARGEFPCONVERT_H
+#endif // LLVM_CODEGEN_EXPANDFP_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index dbd61d6b2b2a8..e5cb028b25dd9 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -529,7 +529,7 @@ namespace llvm {
   FunctionPass *createExpandLargeDivRemPass();
 
   // Expands large div/rem instructions.
-  FunctionPass *createExpandLargeFpConvertPass();
+  FunctionPass *createExpandFpPass();
 
   // This pass expands memcmp() to load/stores.
   FunctionPass *createExpandMemCmpLegacyPass();
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index b64c57fdba992..c07cfcdcb939c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2148,8 +2148,8 @@ class TargetLoweringBase {
     return MaxDivRemBitWidthSupported;
   }
 
-  /// Returns the size in bits of the maximum larget fp convert the backend
-  /// supports. Larger operations will be expanded by ExpandLargeFPConvert.
+  /// Returns the size in bits of the maximum fp to/from int conversion the
+  /// backend supports. Larger operations will be expanded by ExpandFp.
   unsigned getMaxLargeFPConvertBitWidthSupported() const {
     return MaxLargeFPConvertBitWidthSupported;
   }
@@ -2782,8 +2782,8 @@ class TargetLoweringBase {
     MaxDivRemBitWidthSupported = SizeInBits;
   }
 
-  /// Set the size in bits of the maximum fp convert the backend supports.
-  /// Larger operations will be expanded by ExpandLargeFPConvert.
+  /// Set the size in bits of the maximum fp to/from int conversion the backend
+  /// supports. Larger operations will be expanded by ExpandFp.
   void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits) {
     MaxLargeFPConvertBitWidthSupported = SizeInBits;
   }
@@ -3580,8 +3580,9 @@ class TargetLoweringBase {
   /// Larger operations will be expanded by ExpandLargeDivRem.
   unsigned MaxDivRemBitWidthSupported;
 
-  /// Size in bits of the maximum larget fp convert size the backend
-  /// supports. Larger operations will be expanded by ExpandLargeFPConvert.
+  /// Size in bits of the maximum fp to/from int conversion size the
+  /// backend supports. Larger operations will be expanded by
+  /// ExpandFp.
   unsigned MaxLargeFPConvertBitWidthSupported;
 
   /// Size in bits of the minimum cmpxchg or ll/sc operation the
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 36be3d552f556..7e8c0e91923c0 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -105,7 +105,7 @@ void initializeEarlyMachineLICMPass(PassRegistry &);
 void initializeEarlyTailDuplicateLegacyPass(PassRegistry &);
 void initializeEdgeBundlesWrapperLegacyPass(PassRegistry &);
 void initializeEHContGuardTargetsPass(PassRegistry &);
-void initializeExpandLargeFpConvertLegacyPassPass(PassRegistry &);
+void initializeExpandFpLegacyPassPass(PassRegistry &);
 void initializeExpandLargeDivRemLegacyPassPass(PassRegistry &);
 void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
 void initializeExpandPostRALegacyPass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 90a0cdf803560..0a1ded998d338 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -30,7 +30,7 @@
 #include "llvm/CodeGen/DwarfEHPrepare.h"
 #include "llvm/CodeGen/EarlyIfConversion.h"
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
-#include "llvm/CodeGen/ExpandLargeFpConvert.h"
+#include "llvm/CodeGen/ExpandFp.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/ExpandPostRAPseudos.h"
 #include "llvm/CodeGen/ExpandReductions.h"
@@ -661,7 +661,7 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addISelPasses(
 
   addPass(PreISelIntrinsicLoweringPass(&TM));
   addPass(ExpandLargeDivRemPass(&TM));
-  addPass(ExpandLargeFpConvertPass(&TM));
+  addPass(ExpandFpPass(&TM));
 
   derived().addIRPasses(addPass);
   derived().addCodeGenPrepare(addPass);
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index ebfdaf82169d8..756d308f0b009 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -52,7 +52,7 @@ FUNCTION_PASS("consthoist", ConstantHoistingPass())
 FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM))
 FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass(false))
 FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM))
-FUNCTION_PASS("expand-large-fp-convert", ExpandLargeFpConvertPass(TM))
+FUNCTION_PASS("expand-fp", ExpandFpPass(TM))
 FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
 FUNCTION_PASS("expand-reductions", ExpandReductionsPass())
 FUNCTION_PASS("gc-lowering", GCLoweringPass())
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 0c92637a75e77..0e237ba31a8ca 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -57,7 +57,7 @@ add_llvm_component_library(LLVMCodeGen
   EHContGuardTargets.cpp
   ExecutionDomainFix.cpp
   ExpandLargeDivRem.cpp
-  ExpandLargeFpConvert.cpp
+  ExpandFp.cpp
   ExpandMemCmp.cpp
   ExpandPostRAPseudos.cpp
   ExpandReductions.cpp
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index b36d2f743d512..4afeb08a57f60 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -40,7 +40,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeEarlyMachineLICMPass(Registry);
   initializeEarlyTailDuplicateLegacyPass(Registry);
   initializeExpandLargeDivRemLegacyPassPass(Registry);
-  initializeExpandLargeFpConvertLegacyPassPass(Registry);
+  initializeExpandFpLegacyPassPass(Registry);
   initializeExpandMemCmpLegacyPassPass(Registry);
   initializeExpandPostRALegacyPass(Registry);
   initializeFEntryInserterPass(Registry);
diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index fa9bfab03bdf7..4db992f8d5862 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -1,21 +1,20 @@
-//===--- ExpandLargeFpConvert.cpp - Expand large fp convert----------------===//
+//===--- ExpandFp.cpp - Expand fp instructions ----------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// This pass expands certain floating point instructions at the IR level.
 //
-// This pass expands ‘fptoui .. to’, ‘fptosi .. to’, ‘uitofp .. to’,
-// ‘sitofp .. to’ instructions with a bitwidth above a threshold into
-// auto-generated functions. This is useful for targets like x86_64 that cannot
-// lower fp convertions with more than 128 bits.
-// Furthermore, the pass can expand FRem instructions if requested in the
-// TargetLowering for the current target.
+// It expands ‘fptoui .. to’, ‘fptosi .. to’, ‘uitofp .. to’, ‘sitofp
+// .. to’ instructions with a bitwidth above a threshold.  This is
+// useful for targets like x86_64 that cannot lower fp conversions
+// with more than 128 bits.
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/ExpandLargeFpConvert.h"
+#include "llvm/CodeGen/ExpandFp.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/CodeGen/Passes.h"
@@ -38,7 +37,7 @@ using namespace llvm;
 static cl::opt<unsigned>
     ExpandFpConvertBits("expand-fp-convert-bits", cl::Hidden,
                      cl::init(llvm::IntegerType::MAX_INT_BITS),
-                     cl::desc("fp convert instructions on integers with "
+                     cl::desc("fp  convert instructions on integers with "
                               "more than <N> bits are expanded."));
 
 namespace {
@@ -1048,12 +1047,12 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
 }
 
 namespace {
-class ExpandLargeFpConvertLegacyPass : public FunctionPass {
+class ExpandFpLegacyPass : public FunctionPass {
 public:
   static char ID;
 
-  ExpandLargeFpConvertLegacyPass() : FunctionPass(ID) {
-    initializeExpandLargeFpConvertLegacyPassPass(
+  ExpandFpLegacyPass() : FunctionPass(ID) {
+    initializeExpandFpLegacyPassPass(
         *PassRegistry::getPassRegistry());
   }
 
@@ -1071,19 +1070,19 @@ class ExpandLargeFpConvertLegacyPass : public FunctionPass {
 };
 } // namespace
 
-PreservedAnalyses ExpandLargeFpConvertPass::run(Function &F,
+PreservedAnalyses ExpandFpPass::run(Function &F,
                                                 FunctionAnalysisManager &FAM) {
   const TargetSubtargetInfo *STI = TM->getSubtargetImpl(F);
   return runImpl(F, *STI->getTargetLowering()) ? PreservedAnalyses::none()
                                                : PreservedAnalyses::all();
 }
 
-char ExpandLargeFpConvertLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ExpandLargeFpConvertLegacyPass, "expand-large-fp-convert",
-                      "Expand large fp convert", false, false)
-INITIALIZE_PASS_END(ExpandLargeFpConvertLegacyPass, "expand-large-fp-convert",
-                    "Expand large fp convert", false, false)
+char ExpandFpLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp",
+                      "Expand certain fp instructions", false, false)
+INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp",
+                    "Expand fp", false, false)
 
-FunctionPass *llvm::createExpandLargeFpConvertPass() {
-  return new ExpandLargeFpConvertLegacyPass();
+FunctionPass *llvm::createExpandFpPass() {
+  return new ExpandFpLegacyPass();
 }
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index ea5e43ff12166..f788ec5ecb15b 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1070,7 +1070,7 @@ bool TargetPassConfig::addISelPasses() {
   PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
   addPass(createPreISelIntrinsicLoweringPass());
   addPass(createExpandLargeDivRemPass());
-  addPass(createExpandLargeFpConvertPass());
+  addPass(createExpandFpPass());
   addIRPasses();
   addCodeGenPrepare();
   addPassesToHandleExceptions();
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 2e62d6e6f4cc6..8e4b73b383a73 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -89,7 +89,7 @@
 #include "llvm/CodeGen/EarlyIfConversion.h"
 #include "llvm/CodeGen/EdgeBundles.h"
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
-#include "llvm/CodeGen/ExpandLargeFpConvert.h"
+#include "llvm/CodeGen/ExpandFp.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/ExpandPostRAPseudos.h"
 #include "llvm/CodeGen/FinalizeISel.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index bfd952df25e98..2b86e0420cf91 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -367,7 +367,7 @@ FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter())
 FUNCTION_PASS("dse", DSEPass())
 FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM))
 FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM))
-FUNCTION_PASS("expand-large-fp-convert", ExpandLargeFpConvertPass(TM))
+FUNCTION_PASS("expand-fp", ExpandFpPass(TM))
 FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
 FUNCTION_PASS("extra-vector-passes",
                   ExtraFunctionPassManager<ShouldRunExtraVectorPasses>())
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
index 0d079881cb909..abc67eec32391 100644
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -16,7 +16,7 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
+; CHECK-NEXT:       Expand fp
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 49a86134411d6..e1481667a4ab7 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -20,7 +20,7 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
+; CHECK-NEXT:       Expand fp
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:     SVE intrinsics optimizations
 ; CHECK-NEXT:       FunctionPass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
index 44139fafbfe20..e47b98021a68a 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s 2>&1 | FileCheck -check-prefix=GISEL %s
 
-; FIXME: GISEL can't handle the "fptrunc float to bfloat" that expand-large-fp-convert emits.
+; FIXME: GISEL can't handle the "fptrunc float to bfloat" that -expand-fp emits.
 
 ; GISEL: unable to translate instruction: fptrunc
 
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 9aca7a5fc741f..4b6cc32522f5b 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -27,7 +27,7 @@
 ; GCN-O0-NEXT:    Pre-ISel Intrinsic Lowering
 ; GCN-O0-NEXT:    FunctionPass Manager
 ; GCN-O0-NEXT:      Expand large div/rem
-; GCN-O0-NEXT:      Expand large fp convert
+; GCN-O0-NEXT:      Expand fp
 ; GCN-O0-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O0-NEXT:    AMDGPU Printf lowering
 ; GCN-O0-NEXT:    Lower ctors and dtors for AMDGPU
@@ -177,7 +177,7 @@
 ; GCN-O1-NEXT:    Pre-ISel Intrinsic Lowering
 ; GCN-O1-NEXT:    FunctionPass Manager
 ; GCN-O1-NEXT:      Expand large div/rem
-; GCN-O1-NEXT:      Expand large fp convert
+; GCN-O1-NEXT:      Expand fp
 ; GCN-O1-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O1-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-NEXT:    Lower ctors and dtors for AMDGPU
@@ -462,7 +462,7 @@
 ; GCN-O1-OPTS-NEXT:    Pre-ISel Intrinsic Lowering
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:      Expand large div/rem
-; GCN-O1-OPTS-NEXT:      Expand large fp convert
+; GCN-O1-OPTS-NEXT:      Expand fp
 ; GCN-O1-OPTS-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O1-OPTS-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-OPTS-NEXT:    Lower ctors and dtors for AMDGPU
@@ -775,7 +775,7 @@
 ; GCN-O2-NEXT:    Pre-ISel Intrinsic Lowering
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      Expand large div/rem
-; GCN-O2-NEXT:      Expand large fp convert
+; GCN-O2-NEXT:      Expand fp
 ; GCN-O2-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O2-NEXT:    AMDGPU Printf lowering
 ; GCN-O2-NEXT:    Lower ctors and dtors for AMDGPU
@@ -1094,7 +1094,7 @@
 ; GCN-O3-NEXT:    Pre-ISel Intrinsic Lowering
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      Expand large div/rem
-; GCN-O3-NEXT:      Expand large fp convert
+; GCN-O3-NEXT:      Expand fp
 ; GCN-O3-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O3-NEXT:    AMDGPU Printf lowering
 ; GCN-O3-NEXT:    Lower ctors and dtors for AMDGPU
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 1840b5ce46c6f..960d7305e66f6 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -6,7 +6,7 @@
 ; CHECK-NEXT:    Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:    FunctionPass Manager
 ; CHECK-NEXT:      Expand large div/rem
-; CHECK-NEXT:      Expand large fp convert
+; CHECK-NEXT:      Expand fp
 ; CHECK-NEXT:      Expand Atomic instructions
 ; CHECK-NEXT:      Simplify the CFG
 ; CHECK-NEXT:      Dominator Tree Construction
diff --git a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
index 24bd4c75a9821..d16cb1c15870b 100644
--- a/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/O0-pipeline.ll
@@ -20,7 +20,7 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
+; CHECK-NEXT:       Expand fp
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index ab76d4e998d2b..90d994909264a 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -32,7 +32,7 @@
 ; LAXX-NEXT:     Pre-ISel Intrinsic Lowering
 ; LAXX-NEXT:     FunctionPass Manager
 ; LAXX-NEXT:       Expand large div/rem
-; LAXX-NEXT:       Expand large fp convert
+; LAXX-NEXT:       Expand fp
 ; LAXX-NEXT:       Expand Atomic instructions
 ; LAXX-NEXT:       Module Verifier
 ; LAXX-NEXT:       Dominator Tree Construction
diff --git a/llvm/test/CodeGen/M68k/pipeline.ll b/llvm/test/CodeGen/M68k/pipeline.ll
index d61e591505e59..deaaffa907eb1 100644
--- a/llvm/test/CodeGen/M68k/pipeline.ll
+++ b/llvm/test/CodeGen/M68k/pipeline.ll
@@ -3,7 +3,7 @@
 ; CHECK-NEXT:    Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:    FunctionPass Manager
 ; CHECK-NEXT:      Expand large div/rem
-; CHECK-NEXT:      Expand large fp convert
+; CHECK-NEXT:      Expand fp
 ; CHECK-NEXT:      Expand Atomic instructions
 ; CHECK-NEXT:      Module Verifier
 ; CHECK-NEXT:      Dominator Tree Construction
diff --git a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
index 5853647bf3b9f..38b1074e55d22 100644
--- a/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O0-pipeline.ll
@@ -17,7 +17,7 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
+; CHECK-NEXT:       Expand fp
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:     PPC Lower MASS Entries
 ; CHECK-NEXT:     FunctionPass Manager
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index 3920d75c83ffe..7cbb1a1c98873 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -20,7 +20,7 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
+; CHECK-NEXT:       Expand fp
 ; CHECK-NEXT:       Convert i1 constants to i32/i64 if they are returned
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:     PPC Lower MASS Entries
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index f93cb65897210..694662eab1681 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -20,7 +20,7 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
+; CHECK-NEXT:       Expand fp
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:       RISC-V Zacas ABI fix 
 ; CHECK-NEXT:       Module Verifier
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 976d1ee003a1f..beef7a574dc4f 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -24,7 +24,7 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
+; CHECK-NEXT:       Expand fp
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:       RISC-V Zacas ABI fix 
 ; CHECK-NEXT:       Dominator Tree Construction
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 4c99dd830b442..6d824f8b510af 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -18,7 +18,7 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
+; CHECK-NEXT:       Expand fp
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:       Lower AMX intrinsics
 ; CHECK-NEXT:       Lower AMX type for load/store
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 203be56751d09..d72f517cfb603 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -28,7 +28,7 @@
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Expand large div/rem
-; CHECK-NEXT:       Expand large fp convert
+; CHECK-NEXT:       Expand fp
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:       Lower AMX intrinsics
 ; CHECK-NEXT:       Lower AMX type for load/store
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
index e058c5bb4aa05..f5bf8bb61a16e 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=x86_64-- -expand-large-fp-convert < %s | FileCheck %s
-; RUN: opt -S -mtriple=x86_64-- -passes=expand-large-fp-convert < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-- --expand-fp < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-- -passes=expand-fp < %s | FileCheck %s
 
 define i129 @halftosi129(half %a) {
 ; CHECK-LABEL: @halftosi129(
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
index c699f805754cc..94ed32abe46f8 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=x86_64-- -expand-large-fp-convert < %s | FileCheck %s
-; RUN: opt -S -mtriple=x86_64-- -passes=expand-large-fp-convert < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-- --expand-fp < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-- -passes=expand-fp < %s | FileCheck %s
 
 define i129 @halftoui129(half %a) {
 ; CHECK-LABEL: @halftoui129(
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
index f70ce2f85f65b..8820b873f3818 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=x86_64-- -expand-large-fp-convert < %s | FileCheck %s
-; RUN: opt -S -mtriple=x86_64-- -passes=expand-large-fp-convert < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-- --expand-fp < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-- -passes=expand-fp < %s | FileCheck %s
 
 define half @si129tohalf(i129 %a) {
 ; CHECK-LABEL: @si129tohalf(
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
index ee54d53e9ba03..b58d88bc02c79 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=x86_64-- -expand-large-fp-convert < %s | FileCheck %s
-; RUN: opt -S -mtriple=x86_64-- -passes=expand-large-fp-convert < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-- --expand-fp < %s | FileCheck %s
+; RUN: opt -S -mtriple=x86_64-- -passes=expand-fp < %s | FileCheck %s
 
 define half @ui129tohalf(i129 %a) {
 ; CHECK-LABEL: @ui129tohalf(
diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
index 4759d03ba80d7..880f1fc664468 100644
--- a/llvm/tools/opt/optdriver.cpp
+++ b/llvm/tools/opt/optdriver.cpp
@@ -373,7 +373,7 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) {
       "expand-large-div-rem",
       "structurizecfg",
       "fix-irreducible",
-      "expand-large-fp-convert",
+      "expand-fp",
       "callbrprepare",
       "scalarizer",
   };
@@ -425,7 +425,7 @@ extern "C" int optMain(
   // For codegen passes, only passes that do IR to IR transformation are
   // supported.
   initializeExpandLargeDivRemLegacyPassPass(Registry);
-  initializeExpandLargeFpConvertLegacyPassPass(Registry);
+  initializeExpandFpLegacyPassPass(Registry);
   initializeExpandMemCmpLegacyPassPass(Registry);
   initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
   initializeSelectOptimizePass(Registry);
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
index 548ac41f43e5c..b99676b52aea7 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
@@ -57,7 +57,7 @@ static_library("CodeGen") {
     "EdgeBundles.cpp",
     "ExecutionDomainFix.cpp",
     "ExpandLargeDivRem.cpp",
-    "ExpandLargeFpConvert.cpp",
+    "ExpandFp.cpp",
     "ExpandMemCmp.cpp",
     "ExpandPostRAPseudos.cpp",
     "ExpandReductions.cpp",

>From ea49f29e0696b2631f7425ca12e46b53aadf5045 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 13 Mar 2025 07:42:30 -0400
Subject: [PATCH 06/19] Rename ExpandLargeFpConvert.{cpp,h}

---
 .../include/llvm/CodeGen/{ExpandLargeFpConvert.h => ExpandFp.h} | 0
 llvm/lib/CodeGen/{ExpandLargeFpConvert.cpp => ExpandFp.cpp}     | 2 +-
 llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll                       | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename llvm/include/llvm/CodeGen/{ExpandLargeFpConvert.h => ExpandFp.h} (100%)
 rename llvm/lib/CodeGen/{ExpandLargeFpConvert.cpp => ExpandFp.cpp} (99%)

diff --git a/llvm/include/llvm/CodeGen/ExpandLargeFpConvert.h b/llvm/include/llvm/CodeGen/ExpandFp.h
similarity index 100%
rename from llvm/include/llvm/CodeGen/ExpandLargeFpConvert.h
rename to llvm/include/llvm/CodeGen/ExpandFp.h
diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
similarity index 99%
rename from llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
rename to llvm/lib/CodeGen/ExpandFp.cpp
index 4db992f8d5862..1700af7e5ce1b 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -37,7 +37,7 @@ using namespace llvm;
 static cl::opt<unsigned>
     ExpandFpConvertBits("expand-fp-convert-bits", cl::Hidden,
                      cl::init(llvm::IntegerType::MAX_INT_BITS),
-                     cl::desc("fp  convert instructions on integers with "
+                     cl::desc("fp convert instructions on integers with "
                               "more than <N> bits are expanded."));
 
 namespace {
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
index e47b98021a68a..c001df48499c7 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s 2>&1 | FileCheck -check-prefix=GISEL %s
 
-; FIXME: GISEL can't handle the "fptrunc float to bfloat" that -expand-fp emits.
+; FIXME: GISEL can't handle the "fptrunc float to bfloat" that expand-fp emits.
 
 ; GISEL: unable to translate instruction: fptrunc
 

>From 80301aa05da886492aed4f76ad891cb05d75b33a Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 13 Mar 2025 07:56:32 -0400
Subject: [PATCH 07/19] clang-format changes

Adjust line length
---
 llvm/include/llvm/CodeGen/ExpandFp.h          |  3 +-
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |  2 +-
 llvm/lib/CodeGen/ExpandFp.cpp                 | 50 +++++++++----------
 llvm/lib/Passes/PassBuilder.cpp               |  2 +-
 4 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ExpandFp.h b/llvm/include/llvm/CodeGen/ExpandFp.h
index c7213981d5926..c13119a4238ef 100644
--- a/llvm/include/llvm/CodeGen/ExpandFp.h
+++ b/llvm/include/llvm/CodeGen/ExpandFp.h
@@ -15,8 +15,7 @@ namespace llvm {
 
 class TargetMachine;
 
-class ExpandFpPass
-    : public PassInfoMixin<ExpandFpPass> {
+class ExpandFpPass : public PassInfoMixin<ExpandFpPass> {
 private:
   const TargetMachine *TM;
 
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 0a1ded998d338..98d74f0069252 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -29,8 +29,8 @@
 #include "llvm/CodeGen/DetectDeadLanes.h"
 #include "llvm/CodeGen/DwarfEHPrepare.h"
 #include "llvm/CodeGen/EarlyIfConversion.h"
-#include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandFp.h"
+#include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/ExpandPostRAPseudos.h"
 #include "llvm/CodeGen/ExpandReductions.h"
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 1700af7e5ce1b..311dacf120d98 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -1,4 +1,4 @@
-//===--- ExpandFp.cpp - Expand fp instructions ----------------===//
+//===--- ExpandFp.cpp - Expand fp instructions ----------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -36,9 +36,9 @@ using namespace llvm;
 
 static cl::opt<unsigned>
     ExpandFpConvertBits("expand-fp-convert-bits", cl::Hidden,
-                     cl::init(llvm::IntegerType::MAX_INT_BITS),
-                     cl::desc("fp convert instructions on integers with "
-                              "more than <N> bits are expanded."));
+                        cl::init(llvm::IntegerType::MAX_INT_BITS),
+                        cl::desc("fp convert instructions on integers with "
+                                 "more than <N> bits are expanded."));
 
 namespace {
 /// This class implements a precise expansion of the frem instruction.
@@ -435,8 +435,8 @@ static bool expandFRem(BinaryOperator &I) {
 ///   br i1 %cmp6.not, label %if.end12, label %if.then8
 ///
 /// if.then8:                                         ; preds = %if.end
-///   %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 -9223372036854775808
-///   br label %cleanup
+///   %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64
+///   -9223372036854775808 br label %cleanup
 ///
 /// if.end12:                                         ; preds = %if.end
 ///   %cmp13 = icmp ult i64 %shr, 150
@@ -454,9 +454,10 @@ static bool expandFRem(BinaryOperator &I) {
 ///   %mul19 = mul nsw i64 %shl, %conv
 ///   br label %cleanup
 ///
-/// cleanup:                                          ; preds = %entry, %if.else, %if.then15, %if.then8
-///   %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ %mul19, %if.else ], [ 0, %entry ]
-///   ret i64 %retval.0
+/// cleanup:                                          ; preds = %entry,
+/// %if.else, %if.then15, %if.then8
+///   %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [
+///   %mul19, %if.else ], [ 0, %entry ] ret i64 %retval.0
 /// }
 ///
 /// Replace fp to integer with generated code.
@@ -640,13 +641,11 @@ static void expandFPToI(Instruction *FPToI) {
 ///   %or = or i64 %shr6, %conv11
 ///   br label %sw.epilog
 ///
-/// sw.epilog:                                        ; preds = %sw.default, %if.then4, %sw.bb
-///   %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, %sw.bb ]
-///   %1 = lshr i64 %a.addr.0, 2
-///   %2 = and i64 %1, 1
-///   %or16 = or i64 %2, %a.addr.0
-///   %inc = add nsw i64 %or16, 1
-///   %3 = and i64 %inc, 67108864
+/// sw.epilog:                                        ; preds = %sw.default,
+/// %if.then4, %sw.bb
+///   %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl,
+///   %sw.bb ] %1 = lshr i64 %a.addr.0, 2 %2 = and i64 %1, 1 %or16 = or i64 %2,
+///   %a.addr.0 %inc = add nsw i64 %or16, 1 %3 = and i64 %inc, 67108864
 ///   %tobool.not = icmp eq i64 %3, 0
 ///   %spec.select.v = select i1 %tobool.not, i64 2, i64 3
 ///   %spec.select = ashr i64 %inc, %spec.select.v
@@ -659,7 +658,8 @@ static void expandFPToI(Instruction *FPToI) {
 ///   %shl25 = shl i64 %sub, %sh_prom24
 ///   br label %if.end26
 ///
-/// if.end26:                                         ; preds = %sw.epilog, %if.else
+/// if.end26:                                         ; preds = %sw.epilog,
+/// %if.else
 ///   %a.addr.1 = phi i64 [ %shl25, %if.else ], [ %spec.select, %sw.epilog ]
 ///   %e.0 = phi i32 [ %sub2, %if.else ], [ %spec.select56, %sw.epilog ]
 ///   %conv27 = trunc i64 %shr to i32
@@ -673,7 +673,8 @@ static void expandFPToI(Instruction *FPToI) {
 ///   %4 = bitcast i32 %or33 to float
 ///   br label %return
 ///
-/// return:                                           ; preds = %entry, %if.end26
+/// return:                                           ; preds = %entry,
+/// %if.end26
 ///   %retval.0 = phi float [ %4, %if.end26 ], [ 0.000000e+00, %entry ]
 ///   ret float %retval.0
 /// }
@@ -1052,8 +1053,7 @@ class ExpandFpLegacyPass : public FunctionPass {
   static char ID;
 
   ExpandFpLegacyPass() : FunctionPass(ID) {
-    initializeExpandFpLegacyPassPass(
-        *PassRegistry::getPassRegistry());
+    initializeExpandFpLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
   bool runOnFunction(Function &F) override {
@@ -1070,8 +1070,7 @@ class ExpandFpLegacyPass : public FunctionPass {
 };
 } // namespace
 
-PreservedAnalyses ExpandFpPass::run(Function &F,
-                                                FunctionAnalysisManager &FAM) {
+PreservedAnalyses ExpandFpPass::run(Function &F, FunctionAnalysisManager &FAM) {
   const TargetSubtargetInfo *STI = TM->getSubtargetImpl(F);
   return runImpl(F, *STI->getTargetLowering()) ? PreservedAnalyses::none()
                                                : PreservedAnalyses::all();
@@ -1080,9 +1079,6 @@ PreservedAnalyses ExpandFpPass::run(Function &F,
 char ExpandFpLegacyPass::ID = 0;
 INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp",
                       "Expand certain fp instructions", false, false)
-INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp",
-                    "Expand fp", false, false)
+INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp", "Expand fp", false, false)
 
-FunctionPass *llvm::createExpandFpPass() {
-  return new ExpandFpLegacyPass();
-}
+FunctionPass *llvm::createExpandFpPass() { return new ExpandFpLegacyPass(); }
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 8e4b73b383a73..4a37678a5aa4d 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -88,8 +88,8 @@
 #include "llvm/CodeGen/DwarfEHPrepare.h"
 #include "llvm/CodeGen/EarlyIfConversion.h"
 #include "llvm/CodeGen/EdgeBundles.h"
-#include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandFp.h"
+#include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/ExpandPostRAPseudos.h"
 #include "llvm/CodeGen/FinalizeISel.h"

>From 1812c7f216f9bd0e36a7aa2961e36cfbc2d47510 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <frederik.harwath at amd.com>
Date: Fri, 14 Mar 2025 13:20:58 +0100
Subject: [PATCH 08/19] Add IRBuilder::CreateFMA (#131112)

This commit adds a function for creating fma intrinsic calls to the IRBuilder. If the "IsFPConstrained" flag of the builder is set,
the function creates a call to "llvm.experimental.constrained.fma" instead of "llvm.fma".
To support the creation of the constrained intrinsic, a function "CreateConstrainedFPIntrinsic" is introduced.
---
 llvm/include/llvm/IR/IRBuilder.h    | 24 ++++++++++++++++++++++++
 llvm/lib/IR/AutoUpgrade.cpp         |  4 ++--
 llvm/lib/IR/IRBuilder.cpp           | 20 ++++++++++++++++++++
 llvm/unittests/IR/IRBuilderTest.cpp | 24 ++++++++++++++++++------
 4 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 67e357c600d3b..750a99cc50dd7 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -1065,6 +1065,19 @@ class IRBuilderBase {
                            {Src, Exp}, FMFSource, Name);
   }
 
+  /// Create a call to the fma intrinsic (constrained if IsFPConstrained).
+  Value *CreateFMA(Value *Factor1, Value *Factor2, Value *Summand,
+                   FMFSource FMFSource = {}, const Twine &Name = "") {
+    if (IsFPConstrained) {
+      return CreateConstrainedFPIntrinsic(
+          Intrinsic::experimental_constrained_fma, {Factor1->getType()},
+          {Factor1, Factor2, Summand}, FMFSource, Name);
+    }
+
+    return CreateIntrinsic(Intrinsic::fma, {Factor1->getType()},
+                           {Factor1, Factor2, Summand}, FMFSource, Name);
+  }
+
   /// Create a call to the arithmetic_fence intrinsic.
   CallInst *CreateArithmeticFence(Value *Val, Type *DstType,
                                   const Twine &Name = "") {
@@ -1723,6 +1736,17 @@ class IRBuilderBase {
     return Accum;
   }
 
+  /// This function is like @ref CreateIntrinsic for constrained fp
+  /// intrinsics. It sets the rounding mode and exception behavior of
+  /// the created intrinsic call according to \p Rounding and \p
+  /// Except and it sets \p FPMathTag as the 'fpmath' metadata, using
+  /// defaults if a value equals nullopt/null.
+  CallInst *CreateConstrainedFPIntrinsic(
+      Intrinsic::ID ID, ArrayRef<Type *> Types, ArrayRef<Value *> Args,
+      FMFSource FMFSource, const Twine &Name, MDNode *FPMathTag = nullptr,
+      std::optional<RoundingMode> Rounding = std::nullopt,
+      std::optional<fp::ExceptionBehavior> Except = std::nullopt);
+
   CallInst *CreateConstrainedFPBinOp(
       Intrinsic::ID ID, Value *L, Value *R, FMFSource FMFSource = {},
       const Twine &Name = "", MDNode *FPMathTag = nullptr,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index cb4ecc60aa473..ce3b2c90a41a1 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -3755,7 +3755,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
         IID = Intrinsic::x86_avx512_vfmadd_f32;
       Rep = Builder.CreateIntrinsic(IID, {}, Ops);
     } else {
-      Rep = Builder.CreateIntrinsic(Intrinsic::fma, A->getType(), {A, B, C});
+      Rep = Builder.CreateFMA(A, B, C);
     }
 
     Value *PassThru = IsMaskZ   ? Constant::getNullValue(Rep->getType())
@@ -3808,7 +3808,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
 
       Rep = Builder.CreateIntrinsic(IID, {}, {A, B, C, CI->getArgOperand(4)});
     } else {
-      Rep = Builder.CreateIntrinsic(Intrinsic::fma, A->getType(), {A, B, C});
+      Rep = Builder.CreateFMA(A, B, C);
     }
 
     Value *PassThru = IsMaskZ   ? llvm::Constant::getNullValue(CI->getType())
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 134459265cecb..421b617a5fb7e 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -950,6 +950,26 @@ CallInst *IRBuilderBase::CreateConstrainedFPBinOp(
   return C;
 }
 
+CallInst *IRBuilderBase::CreateConstrainedFPIntrinsic(
+    Intrinsic::ID ID, ArrayRef<Type *> Types, ArrayRef<Value *> Args,
+    FMFSource FMFSource, const Twine &Name, MDNode *FPMathTag,
+    std::optional<RoundingMode> Rounding,
+    std::optional<fp::ExceptionBehavior> Except) {
+  Value *RoundingV = getConstrainedFPRounding(Rounding);
+  Value *ExceptV = getConstrainedFPExcept(Except);
+
+  FastMathFlags UseFMF = FMFSource.get(FMF);
+
+  llvm::SmallVector<Value *, 5> ExtArgs(Args);
+  ExtArgs.push_back(RoundingV);
+  ExtArgs.push_back(ExceptV);
+
+  CallInst *C = CreateIntrinsic(ID, Types, ExtArgs, nullptr, Name);
+  setConstrainedFPCallAttr(C);
+  setFPAttrs(C, FPMathTag, UseFMF);
+  return C;
+}
+
 CallInst *IRBuilderBase::CreateConstrainedFPUnroundedBinOp(
     Intrinsic::ID ID, Value *L, Value *R, FMFSource FMFSource,
     const Twine &Name, MDNode *FPMathTag,
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 3a55d88f03d49..e9e9d7b11a36c 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -109,25 +109,23 @@ TEST_F(IRBuilderTest, Intrinsics) {
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Result = Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V});
+  Result = Builder.CreateFMA(V, V, V);
   II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_FALSE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Result =
-      Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
+  Result = Builder.CreateFMA(V, V, V, I);
   II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Result =
-      Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
+  Result = Builder.CreateFMA(V, V, V, FastMathFlags::getFast());
   II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_TRUE(II->hasNoInfs());
-  EXPECT_FALSE(II->hasNoNaNs());
+  EXPECT_TRUE(II->hasNoNaNs());
 
   Result = Builder.CreateUnaryIntrinsic(Intrinsic::roundeven, V);
   II = cast<IntrinsicInst>(Result);
@@ -307,6 +305,11 @@ TEST_F(IRBuilderTest, ConstrainedFP) {
   II = cast<IntrinsicInst>(V);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::experimental_constrained_frem);
 
+  V = Builder.CreateFMA(V, V, V);
+  ASSERT_TRUE(isa<IntrinsicInst>(V));
+  II = cast<IntrinsicInst>(V);
+  EXPECT_EQ(II->getIntrinsicID(), Intrinsic::experimental_constrained_fma);
+
   VInt = Builder.CreateFPToUI(VDouble, Builder.getInt32Ty());
   ASSERT_TRUE(isa<IntrinsicInst>(VInt));
   II = cast<IntrinsicInst>(VInt);
@@ -398,6 +401,15 @@ TEST_F(IRBuilderTest, ConstrainedFP) {
   EXPECT_EQ(fp::ebMayTrap, CII->getExceptionBehavior());
   EXPECT_EQ(RoundingMode::TowardNegative, CII->getRoundingMode());
 
+  // Same as previous test for CreateConstrainedFPIntrinsic
+  Call = Builder.CreateConstrainedFPIntrinsic(
+      Intrinsic::experimental_constrained_fadd, {V->getType()}, {V, V}, nullptr,
+      "", nullptr, RoundingMode::TowardNegative, fp::ebMayTrap);
+  CII = cast<ConstrainedFPIntrinsic>(Call);
+  EXPECT_EQ(CII->getIntrinsicID(), Intrinsic::experimental_constrained_fadd);
+  EXPECT_EQ(fp::ebMayTrap, CII->getExceptionBehavior());
+  EXPECT_EQ(RoundingMode::TowardNegative, CII->getRoundingMode());
+
   Builder.CreateRetVoid();
   EXPECT_FALSE(verifyModule(*M));
 }

>From 5be756adb4e5042ac766306190fb57d3899e2a34 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Mon, 17 Mar 2025 05:46:03 -0400
Subject: [PATCH 09/19] Review changes

Use IRBuilder functions for creating Ldexp, FMA intrinsic call

Use different condition for controlling frem expansion

Update test

Fixup after merge

NaN handling fixes

Update tests
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |    4 -
 llvm/lib/CodeGen/ExpandFp.cpp                 |  103 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   18 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |    1 -
 llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll   | 1942 ++++++++---------
 llvm/test/CodeGen/AMDGPU/wave32.ll            |   64 +-
 6 files changed, 1048 insertions(+), 1084 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index c07cfcdcb939c..55fdbc67c3540 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5672,10 +5672,6 @@ class TargetLowering : public TargetLoweringBase {
                                        LoadSDNode *OriginalLoad,
                                        SelectionDAG &DAG) const;
 
-  /// Indicates whether the FRem instruction should be expanded before
-  /// ISel in the LLVM IR.
-  virtual bool shouldExpandFRemInIR() const { return false; };
-
 private:
   SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                            const SDLoc &DL, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 311dacf120d98..bc827172b3be9 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -17,6 +17,9 @@
 #include "llvm/CodeGen/ExpandFp.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/SimplifyQuery.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
@@ -24,9 +27,12 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
@@ -89,7 +95,7 @@ class FRemExpander {
   /// must match the type for which the class instance has been
   /// created. The code will be generated at the insertion point of \p
   /// B and the insertion point will be reset at exit.
-  Value *buildFRem(Value *X, Value *Y) const;
+  Value *buildFRem(Value *X, Value *Y, SimplifyQuery &SQ) const;
 
 private:
   FRemExpander(IRBuilder<> &B, Type *FremTy, short Bits, unsigned long Signbit,
@@ -98,11 +104,6 @@ class FRemExpander {
         ExTy(B.getInt32Ty()), Bits(ConstantInt::get(ExTy, Bits)),
         One(ConstantInt::get(ExTy, 1)), Signbit(Signbit) {};
 
-  Value *createLdexp(Value *Base, Value *Exp, const Twine &Name) const {
-    return B.CreateIntrinsic(Intrinsic::ldexp, {ComputeFpTy, B.getInt32Ty()},
-                             {Base, Exp}, {}, Name);
-  }
-
   Value *createRcp(Value *V, const Twine &Name) const {
     return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name);
   }
@@ -118,8 +119,7 @@ class FRemExpander {
     //   ax = clt ? axp : ax;
     Value *Q = B.CreateUnaryIntrinsic(Intrinsic::rint, B.CreateFMul(Ax, Ayinv),
                                       {}, "q");
-    Value *AxUpdate = B.CreateIntrinsic(Intrinsic::fma, {ComputeFpTy},
-                                        {B.CreateFNeg(Q), Ay, Ax}, {}, "ax");
+    Value *AxUpdate = B.CreateFMA(B.CreateFNeg(Q), Ay, Ax, {}, "ax");
     Value *Clt = B.CreateFCmp(CmpInst::FCMP_OLT, AxUpdate,
                               ConstantFP::get(ComputeFpTy, 0.0), "clt");
     Value *Axp = B.CreateFAdd(AxUpdate, Ay, "axp");
@@ -145,7 +145,7 @@ class FRemExpander {
     Value *Exp = B.CreateExtractValue(Frexp, {1});
 
     Exp = B.CreateSub(Exp, One, ExName);
-    Value *Pow = createLdexp(Mant, NewExp, PowName);
+    Value *Pow = B.CreateLdexp(Mant, NewExp, {}, PowName);
 
     return {Pow, Exp};
   }
@@ -194,7 +194,7 @@ class FRemExpander {
     AxPhi->addIncoming(Ax, PreheaderBB);
 
     Value *AxPhiUpdate = buildUpdateAx(AxPhi, Ay, Ayinv);
-    AxPhiUpdate = createLdexp(AxPhiUpdate, Bits, "ax_update");
+    AxPhiUpdate = B.CreateLdexp(AxPhiUpdate, Bits, {}, "ax_update");
     AxPhi->addIncoming(AxPhiUpdate, LoopBB);
     NbIv->addIncoming(B.CreateSub(NbIv, Bits, "nb_update"), LoopBB);
 
@@ -212,14 +212,14 @@ class FRemExpander {
     NbExitPhi->addIncoming(NbIv, LoopBB);
     NbExitPhi->addIncoming(Nb, PreheaderBB);
 
-    Value *AxFinal = createLdexp(
-        AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), "ax");
+    Value *AxFinal = B.CreateLdexp(
+        AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), {}, "ax");
     AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv);
 
     // Build:
     //    ax = BUILTIN_FLDEXP_ComputeFpTy(ax, ey);
     //    ret = AS_FLOAT((AS_INT(x) & SIGNBIT_SP32) ^ AS_INT(ax));
-    AxFinal = createLdexp(AxFinal, Ey, "ax");
+    AxFinal = B.CreateLdexp(AxFinal, Ey, {}, "ax");
 
     Value *XAsInt = B.CreateBitCast(X, IntTy, "x_as_int");
     if (ComputeFpTy != X->getType())
@@ -249,28 +249,32 @@ class FRemExpander {
     RetPhi->addIncoming(Ret, B.GetInsertBlock());
   }
 
-  /// Adjust the result of the main computation from the FRem expansion
-  /// if NaNs or infinite values are possible.
-  Value *buildNanAndInfHandling(Value *Ret, Value *X, Value *Y) const {
+  /// Return a value that is NaN if one of the corner cases concerning
+  /// the inputs \p X and \p Y is detected, and \p Ret otherwise.
+  Value *handleInputCornerCases(Value *Ret, Value *X,
+                                Value *Y, SimplifyQuery &SQ) const {
     // Build:
     //   ret = y == 0.0f ? QNAN_ComputeFpTy : ret;
     //   bool c = !BUILTIN_ISNAN_ComputeFpTy(y) &&
     //   BUILTIN_ISFINITE_ComputeFpTy(x);
     //   ret = c ? ret : QNAN_ComputeFpTy;
-    // TODO Handle NaN and infinity fast math flags separately here?
     Value *Nan = ConstantFP::getQNaN(FremTy);
-
-    Ret = B.CreateSelect(B.createIsFPClass(Y, FPClassTest::fcZero), Nan, Ret);
-    Value *C = B.CreateLogicalAnd(
-        B.CreateNot(B.createIsFPClass(Y, FPClassTest::fcNan)),
-        B.createIsFPClass(X, FPClassTest::fcFinite));
+    Ret = B.CreateSelect(B.CreateFCmpOEQ(Y, ConstantFP::get(FremTy, 0.0)), Nan,
+                         Ret);
+    FPClassTest NotNan = FPClassTest::fcInf | FPClassTest::fcFinite;
+    Value *YNotNan =
+        isKnownNeverNaN(Y, 0, SQ) ? B.getTrue() : B.createIsFPClass(Y, NotNan);
+    Value *XFinite = isKnownNeverInfinity(X, 0, SQ)
+                         ? B.getTrue()
+                         : B.createIsFPClass(X, FPClassTest::fcFinite);
+    Value *C = B.CreateLogicalAnd(YNotNan, XFinite);
     Ret = B.CreateSelect(C, Ret, Nan);
 
     return Ret;
   }
 };
 
-Value *FRemExpander::buildFRem(Value *X, Value *Y) const {
+  Value *FRemExpander::buildFRem(Value *X, Value *Y, SimplifyQuery &SQ) const {
   assert(X->getType() == FremTy && Y->getType() == FremTy);
 
   FastMathFlags FMF = B.getFastMathFlags();
@@ -293,8 +297,10 @@ Value *FRemExpander::buildFRem(Value *X, Value *Y) const {
   PHINode *RetPhi = B.CreatePHI(FremTy, 2, "ret");
   Value *Ret = RetPhi;
 
-  if (!FMF.noNaNs() || !FMF.noInfs())
-    Ret = buildNanAndInfHandling(Ret, X, Y);
+  // We would return NaN in all corner cases handled here.
+  // Hence, if NaNs are excluded, keep the result as it is.
+  if (!FMF.noNaNs())
+    Ret = handleInputCornerCases(Ret, X, Y, SQ);
 
   Function *Fun = B.GetInsertBlock()->getParent();
   auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun);
@@ -352,7 +358,7 @@ static bool shouldSkipExpandFRem(BinaryOperator &I) {
          isConstOrConstSelectOp(I.getOperand(1));
 }
 
-static bool expandFRem(BinaryOperator &I) {
+static bool expandFRem(BinaryOperator &I, SimplifyQuery &SQ) {
   LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
   if (shouldSkipExpandFRem(I)) {
     LLVM_DEBUG(
@@ -384,7 +390,7 @@ static bool expandFRem(BinaryOperator &I) {
 
   Value *Ret;
   if (ReturnTy->isFloatingPointTy())
-    Ret = Expander->buildFRem(I.getOperand(0), I.getOperand(1));
+    Ret = Expander->buildFRem(I.getOperand(0), I.getOperand(1), SQ);
   else {
     auto *VecTy = cast<FixedVectorType>(ReturnTy);
 
@@ -398,7 +404,7 @@ static bool expandFRem(BinaryOperator &I) {
     for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
       Value *Num = B.CreateExtractElement(Nums, I);
       Value *Denum = B.CreateExtractElement(Denums, I);
-      Value *Rem = Expander->buildFRem(Num, Denum);
+      Value *Rem = Expander->buildFRem(Num, Denum, SQ);
       Ret = B.CreateInsertElement(Ret, Rem, I);
     }
   }
@@ -963,6 +969,36 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
   I->eraseFromParent();
 }
 
+// This covers all floating point types; more than we need here.
+// TODO: Move somewhere else for general use?
+/// Return the Libcall for a frem instruction of type \p Ty. 16-bit
+/// types use the f32 libcall.
+static RTLIB::Libcall fremToLibcall(Type *Ty) {
+  assert(Ty->isFloatingPointTy());
+  if (Ty->isFloatTy() || Ty->is16bitFPTy())
+    return RTLIB::REM_F32;
+  if (Ty->isDoubleTy())
+    return RTLIB::REM_F64;
+  if (Ty->isFP128Ty())
+    return RTLIB::REM_F128;
+  if (Ty->isX86_FP80Ty())
+    return RTLIB::REM_F80;
+  if (Ty->isPPC_FP128Ty())
+    return RTLIB::REM_PPCF128;
+
+  llvm_unreachable("Unknown floating point type");
+}
+
+/// Return true if, according to \p TLI, the target either directly
+/// supports the frem instruction for the type \p Ty, has a custom
+/// lowering, or uses a libcall.
+static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) {
+  if (!TLI.isOperationExpand(ISD::FREM, EVT::getEVT(Ty)))
+    return true;
+
+  return TLI.getLibcallName(fremToLibcall(Ty->getScalarType()));
+}
+
 static bool runImpl(Function &F, const TargetLowering &TLI) {
   SmallVector<Instruction *, 4> Replace;
   SmallVector<Instruction *, 4> ReplaceVector;
@@ -979,7 +1015,7 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
   for (auto &I : instructions(F)) {
     switch (I.getOpcode()) {
     case Instruction::FRem:
-      if (TLI.shouldExpandFRemInIR()) {
+      if (!targetSupportsFrem(TLI, I.getType())) {
         Replace.push_back(&I);
         Modified = true;
       }
@@ -1034,10 +1070,11 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
 
   while (!Replace.empty()) {
     Instruction *I = Replace.pop_back_val();
-    if (I->getOpcode() == Instruction::FRem)
-      expandFRem(cast<BinaryOperator>(*I));
-    else if (I->getOpcode() == Instruction::FPToUI ||
-             I->getOpcode() == Instruction::FPToSI) {
+    if (I->getOpcode() == Instruction::FRem) {
+      auto SQ = SimplifyQuery{I->getModule()->getDataLayout(), I};
+      expandFRem(cast<BinaryOperator>(*I), SQ);
+    } else if (I->getOpcode() == Instruction::FPToUI ||
+               I->getOpcode() == Instruction::FPToSI) {
       expandFPToI(I);
     } else {
       expandIToFP(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ade81f17ecca5..d1250e8bd3a84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -411,7 +411,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
                      Expand);
 
-  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
+  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
 
   if (Subtarget->has16BitInsts())
     setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
@@ -1424,7 +1424,6 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
-  case ISD::FREM: return LowerFREM(Op, DAG);
   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
   case ISD::FRINT: return LowerFRINT(Op, DAG);
@@ -2393,21 +2392,6 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
   return DAG.getMergeValues(Res, DL);
 }
 
-// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
-SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc SL(Op);
-  EVT VT = Op.getValueType();
-  auto Flags = Op->getFlags();
-  SDValue X = Op.getOperand(0);
-  SDValue Y = Op.getOperand(1);
-
-  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
-  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
-  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
-  // TODO: For f32 use FMAD instead if !hasFastFMA32?
-  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
-}
-
 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index ab955e7eee2a3..c74dc7942f52c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,7 +387,6 @@ class AMDGPUTargetLowering : public TargetLowering {
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
-  bool shouldExpandFRemInIR() const override { return true; };
 };
 
 namespace AMDGPUISD {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index e40d9690d832b..a0af9698cb7a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -87,19 +87,18 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
 ; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
 ; CI-NEXT:  .LBB0_8: ; %Flow19
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
 ; CI-NEXT:    s_and_b32 s3, s3, 0x7fff
 ; CI-NEXT:    s_and_b32 s3, 0xffff, s3
-; CI-NEXT:    s_cmp_eq_u32 s3, 0
-; CI-NEXT:    s_cselect_b32 s4, 1, 0
+; CI-NEXT:    s_cmpk_lt_u32 s3, 0x7c01
+; CI-NEXT:    s_cselect_b32 s3, 1, 0
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, 0
 ; CI-NEXT:    s_and_b32 s2, 0xffff, s2
 ; CI-NEXT:    s_cmpk_lt_u32 s2, 0x7c00
 ; CI-NEXT:    s_cselect_b32 s2, 1, 0
-; CI-NEXT:    s_cmpk_le_u32 s3, 0x7c00
-; CI-NEXT:    s_cselect_b32 s3, 1, 0
 ; CI-NEXT:    s_and_b32 s2, s3, s2
-; CI-NEXT:    s_and_b32 s3, 1, s4
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v1, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7e00
 ; CI-NEXT:    s_and_b32 s2, 1, s2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -113,23 +112,23 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-LABEL: frem_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s1, 1
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    ; implicit-def: $vgpr1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s0, s[10:11], 0x0
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x8
+; VI-NEXT:    s_load_dword s2, s[10:11], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x8
+; VI-NEXT:    s_mov_b32 s1, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s0|
-; VI-NEXT:    v_cvt_f32_f16_e64 v0, |s2|
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
+; VI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v0
 ; VI-NEXT:    s_cbranch_vccz .LBB0_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    s_and_b32 s1, s2, 0xffff8000
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-NEXT:    s_mov_b32 s1, 0
 ; VI-NEXT:  .LBB0_2: ; %Flow18
@@ -192,17 +191,16 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    s_and_b32 s1, s2, 0xffff8000
 ; VI-NEXT:    v_xor_b32_e32 v1, s1, v0
 ; VI-NEXT:  .LBB0_8: ; %Flow19
-; VI-NEXT:    v_mov_b32_e32 v0, 0x60
-; VI-NEXT:    v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
-; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s2, 3
+; VI-NEXT:    v_mov_b32_e32 v0, 0x3fc
+; VI-NEXT:    v_cmp_eq_f16_e64 vcc, s0, 0
 ; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s0, v0
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7e00
-; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
@@ -477,19 +475,18 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
 ; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
 ; CI-NEXT:  .LBB2_8: ; %Flow19
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
 ; CI-NEXT:    s_and_b32 s3, s3, 0x7fff
 ; CI-NEXT:    s_and_b32 s3, 0xffff, s3
-; CI-NEXT:    s_cmp_eq_u32 s3, 0
-; CI-NEXT:    s_cselect_b32 s4, 1, 0
+; CI-NEXT:    s_cmpk_lt_u32 s3, 0x7c01
+; CI-NEXT:    s_cselect_b32 s3, 1, 0
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, 0
 ; CI-NEXT:    s_and_b32 s2, 0xffff, s2
 ; CI-NEXT:    s_cmpk_lt_u32 s2, 0x7c00
 ; CI-NEXT:    s_cselect_b32 s2, 1, 0
-; CI-NEXT:    s_cmpk_le_u32 s3, 0x7c00
-; CI-NEXT:    s_cselect_b32 s3, 1, 0
 ; CI-NEXT:    s_and_b32 s2, s3, s2
-; CI-NEXT:    s_and_b32 s3, 1, s4
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v1, v2
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7e00
 ; CI-NEXT:    s_and_b32 s2, 1, s2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -503,23 +500,23 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; VI-LABEL: unsafe_frem_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s1, 1
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    ; implicit-def: $vgpr1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s0, s[10:11], 0x0
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x8
+; VI-NEXT:    s_load_dword s2, s[10:11], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x8
+; VI-NEXT:    s_mov_b32 s1, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s0|
-; VI-NEXT:    v_cvt_f32_f16_e64 v0, |s2|
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
+; VI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v0
 ; VI-NEXT:    s_cbranch_vccz .LBB2_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    s_and_b32 s1, s2, 0xffff8000
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-NEXT:    s_mov_b32 s1, 0
 ; VI-NEXT:  .LBB2_2: ; %Flow18
@@ -570,17 +567,16 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    s_and_b32 s1, s2, 0xffff8000
 ; VI-NEXT:    v_xor_b32_e32 v1, s1, v0
 ; VI-NEXT:  .LBB2_8: ; %Flow19
-; VI-NEXT:    v_mov_b32_e32 v0, 0x60
-; VI-NEXT:    v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
-; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s2, 3
+; VI-NEXT:    v_mov_b32_e32 v0, 0x3fc
+; VI-NEXT:    v_cmp_eq_f16_e64 vcc, s0, 0
 ; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s0, v0
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7e00
-; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
@@ -598,37 +594,37 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
 ; CI-LABEL: frem_f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
-; CI-NEXT:    s_mov_b32 s4, 1
+; CI-NEXT:    s_load_dword s0, s[10:11], 0x0
+; CI-NEXT:    s_load_dword s1, s[2:3], 0x4
+; CI-NEXT:    s_mov_b32 s2, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s3
-; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT:    v_mov_b32_e32 v0, s1
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s0|, |v0|
 ; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_cbranch_vccz .LBB3_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
-; CI-NEXT:    s_and_b32 s4, s2, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; CI-NEXT:    v_mov_b32_e32 v1, s4
+; CI-NEXT:    s_and_b32 s2, s0, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s0|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB3_2: ; %Flow16
-; CI-NEXT:    s_xor_b32 s4, s4, 1
-; CI-NEXT:    s_and_b32 s4, s4, 1
-; CI-NEXT:    s_cmp_lg_u32 s4, 0
+; CI-NEXT:    s_xor_b32 s2, s2, 1
+; CI-NEXT:    s_and_b32 s2, s2, 1
+; CI-NEXT:    s_cmp_lg_u32 s2, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB3_8
 ; CI-NEXT:  ; %bb.3: ; %frem.compute
-; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s1|
 ; CI-NEXT:    v_ldexp_f32_e64 v1, v1, 1
-; CI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
-; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
-; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
-; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT:    v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s0|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s0|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s1|
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
 ; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 12
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
@@ -676,57 +672,56 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    v_add_f32_e32 v1, v2, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-NEXT:    s_and_b32 s4, s2, 0x80000000
-; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:    s_and_b32 s2, s0, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; CI-NEXT:  .LBB3_8: ; %Flow17
-; CI-NEXT:    v_mov_b32_e32 v1, 0x60
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v1
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; CI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT:    v_mov_b32_e32 v2, 0x3fc
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v2
-; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s3, 3
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s1, v2
+; CI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], s0, v2
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
-; VI-NEXT:    s_mov_b32 s4, 1
+; VI-NEXT:    s_load_dword s0, s[10:11], 0x0
+; VI-NEXT:    s_load_dword s1, s[2:3], 0x10
+; VI-NEXT:    s_mov_b32 s2, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s0|, |v0|
 ; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    s_cbranch_vccz .LBB3_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_and_b32 s4, s2, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    s_and_b32 s2, s0, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s0|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB3_2: ; %Flow16
-; VI-NEXT:    s_xor_b32 s4, s4, 1
-; VI-NEXT:    s_and_b32 s4, s4, 1
-; VI-NEXT:    s_cmp_lg_u32 s4, 0
+; VI-NEXT:    s_xor_b32 s2, s2, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB3_8
 ; VI-NEXT:  ; %bb.3: ; %frem.compute
-; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s1|
 ; VI-NEXT:    v_ldexp_f32 v1, v1, 1
-; VI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
-; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
-; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
-; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT:    v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s0|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s0|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s1|
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
 ; VI-NEXT:    v_ldexp_f32 v4, v0, 12
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
@@ -774,21 +769,20 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_add_f32_e32 v1, v2, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
-; VI-NEXT:    s_and_b32 s4, s2, 0x80000000
-; VI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; VI-NEXT:    s_and_b32 s2, s0, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; VI-NEXT:  .LBB3_8: ; %Flow17
-; VI-NEXT:    v_mov_b32_e32 v1, 0x60
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v1
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3fc
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v2
-; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s3, 3
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s1, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], s0, v2
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -986,35 +980,35 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
 ; CI-LABEL: unsafe_frem_f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
-; CI-NEXT:    s_mov_b32 s4, 1
+; CI-NEXT:    s_load_dword s0, s[10:11], 0x0
+; CI-NEXT:    s_load_dword s1, s[2:3], 0x4
+; CI-NEXT:    s_mov_b32 s2, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s3
-; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT:    v_mov_b32_e32 v0, s1
+; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s0|, |v0|
 ; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_cbranch_vccz .LBB5_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
-; CI-NEXT:    s_and_b32 s4, s2, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; CI-NEXT:    v_mov_b32_e32 v1, s4
+; CI-NEXT:    s_and_b32 s2, s0, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s0|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB5_2: ; %Flow16
-; CI-NEXT:    s_xor_b32 s4, s4, 1
-; CI-NEXT:    s_and_b32 s4, s4, 1
-; CI-NEXT:    s_cmp_lg_u32 s4, 0
+; CI-NEXT:    s_xor_b32 s2, s2, 1
+; CI-NEXT:    s_and_b32 s2, s2, 1
+; CI-NEXT:    s_cmp_lg_u32 s2, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB5_8
 ; CI-NEXT:  ; %bb.3: ; %frem.compute
-; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
-; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
-; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
-; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s0|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s0|
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s1|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s1|
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
 ; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 12
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
@@ -1052,55 +1046,54 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    v_add_f32_e32 v1, v2, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-NEXT:    s_and_b32 s4, s2, 0x80000000
-; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:    s_and_b32 s2, s0, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; CI-NEXT:  .LBB5_8: ; %Flow17
-; CI-NEXT:    v_mov_b32_e32 v1, 0x60
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v1
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; CI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT:    v_mov_b32_e32 v2, 0x3fc
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v2
-; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s3, 3
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s1, v2
+; CI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], s0, v2
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: unsafe_frem_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
-; VI-NEXT:    s_mov_b32 s4, 1
+; VI-NEXT:    s_load_dword s0, s[10:11], 0x0
+; VI-NEXT:    s_load_dword s1, s[2:3], 0x10
+; VI-NEXT:    s_mov_b32 s2, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s0|, |v0|
 ; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    s_cbranch_vccz .LBB5_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_and_b32 s4, s2, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    s_and_b32 s2, s0, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s0|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB5_2: ; %Flow16
-; VI-NEXT:    s_xor_b32 s4, s4, 1
-; VI-NEXT:    s_and_b32 s4, s4, 1
-; VI-NEXT:    s_cmp_lg_u32 s4, 0
+; VI-NEXT:    s_xor_b32 s2, s2, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB5_8
 ; VI-NEXT:  ; %bb.3: ; %frem.compute
-; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
-; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
-; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
-; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s0|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s0|
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s1|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s1|
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
 ; VI-NEXT:    v_ldexp_f32 v4, v0, 12
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
@@ -1138,21 +1131,20 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_add_f32_e32 v1, v2, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
-; VI-NEXT:    s_and_b32 s4, s2, 0x80000000
-; VI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; VI-NEXT:    s_and_b32 s2, s0, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; VI-NEXT:  .LBB5_8: ; %Flow17
-; VI-NEXT:    v_mov_b32_e32 v1, 0x60
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v1
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3fc
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v2
-; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s3, 3
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s1, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], s0, v2
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -1166,48 +1158,48 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
 ; CI-LABEL: frem_f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s6, 1
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s4, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[10:11], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[0:1]|, |v[0:1]|
 ; CI-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CI-NEXT:    s_cbranch_vccz .LBB6_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[0:1]|, |v[0:1]|
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:    s_brev_b32 s5, 1
+; CI-NEXT:    s_and_b64 s[4:5], s[0:1], s[4:5]
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
-; CI-NEXT:    s_mov_b32 s6, 0
-; CI-NEXT:    s_brev_b32 s7, 1
-; CI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
-; CI-NEXT:    v_mov_b32_e32 v0, s6
-; CI-NEXT:    v_mov_b32_e32 v1, s7
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s4, 0
 ; CI-NEXT:  .LBB6_2: ; %Flow16
-; CI-NEXT:    s_xor_b32 s6, s6, 1
-; CI-NEXT:    s_and_b32 s6, s6, 1
-; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_xor_b32 s4, s4, 1
+; CI-NEXT:    s_and_b32 s4, s4, 1
+; CI-NEXT:    s_cmp_lg_u32 s4, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB6_8
 ; CI-NEXT:  ; %bb.3: ; %frem.compute
-; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
-; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
-; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[2:3]|
 ; CI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
-; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v6
 ; CI-NEXT:    v_add_i32_e32 v8, vcc, -1, v7
 ; CI-NEXT:    v_sub_i32_e32 v9, vcc, v2, v8
 ; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
-; CI-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0
+; CI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; CI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
 ; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
@@ -1245,9 +1237,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:  .LBB6_7: ; %frem.loop_exit
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffe7, v9
 ; CI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
-; CI-NEXT:    s_mov_b32 s6, 0
-; CI-NEXT:    s_brev_b32 s7, 1
-; CI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:    s_brev_b32 s5, 1
+; CI-NEXT:    s_and_b64 s[4:5], s[0:1], s[4:5]
 ; CI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
 ; CI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
@@ -1256,70 +1248,69 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
-; CI-NEXT:    v_xor_b32_e32 v0, s6, v0
-; CI-NEXT:    v_xor_b32_e32 v1, s7, v1
+; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:    v_xor_b32_e32 v1, s5, v1
 ; CI-NEXT:  .LBB6_8: ; %Flow17
-; CI-NEXT:    v_mov_b32_e32 v2, 0x60
-; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v2
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, s[2:3], 0
 ; CI-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
-; CI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; CI-NEXT:    v_mov_b32_e32 v3, 0x3fc
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s11, 0xf000
 ; CI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v3
-; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[4:5], 3
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; CI-NEXT:    v_cmp_class_f64_e64 s[0:1], s[0:1], v3
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s6, 1
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT:    s_mov_b32 s4, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[10:11], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[0:1]|, |v[0:1]|
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; VI-NEXT:    s_cbranch_vccz .LBB6_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[0:1]|, |v[0:1]|
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_brev_b32 s5, 1
+; VI-NEXT:    s_and_b64 s[4:5], s[0:1], s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
-; VI-NEXT:    s_mov_b32 s6, 0
-; VI-NEXT:    s_brev_b32 s7, 1
-; VI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:    s_mov_b32 s4, 0
 ; VI-NEXT:  .LBB6_2: ; %Flow16
-; VI-NEXT:    s_xor_b32 s6, s6, 1
-; VI-NEXT:    s_and_b32 s6, s6, 1
-; VI-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-NEXT:    s_xor_b32 s4, s4, 1
+; VI-NEXT:    s_and_b32 s4, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB6_8
 ; VI-NEXT:  ; %bb.3: ; %frem.compute
-; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
-; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
-; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[2:3]|
 ; VI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
-; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v6
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, -1, v7
 ; VI-NEXT:    v_sub_u32_e32 v9, vcc, v2, v8
 ; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
-; VI-NEXT:    v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0
+; VI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
 ; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
 ; VI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
@@ -1357,9 +1348,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:  .LBB6_7: ; %frem.loop_exit
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0xffffffe7, v9
 ; VI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
-; VI-NEXT:    s_mov_b32 s6, 0
-; VI-NEXT:    s_brev_b32 s7, 1
-; VI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_brev_b32 s5, 1
+; VI-NEXT:    s_and_b64 s[4:5], s[0:1], s[4:5]
 ; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
 ; VI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
@@ -1368,23 +1359,22 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
-; VI-NEXT:    v_xor_b32_e32 v0, s6, v0
-; VI-NEXT:    v_xor_b32_e32 v1, s7, v1
+; VI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; VI-NEXT:    v_xor_b32_e32 v1, s5, v1
 ; VI-NEXT:  .LBB6_8: ; %Flow17
-; VI-NEXT:    v_mov_b32_e32 v2, 0x60
-; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v2
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, s[2:3], 0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
-; VI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT:    v_mov_b32_e32 v3, 0x3fc
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v3
-; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[4:5], 3
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT:    v_cmp_class_f64_e64 s[0:1], s[0:1], v3
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s8
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s9
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
    %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -1604,43 +1594,43 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; CI-LABEL: unsafe_frem_f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s6, 1
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s4, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[10:11], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[0:1]|, |v[0:1]|
 ; CI-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CI-NEXT:    s_cbranch_vccz .LBB8_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[0:1]|, |v[0:1]|
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:    s_brev_b32 s5, 1
+; CI-NEXT:    s_and_b64 s[4:5], s[0:1], s[4:5]
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
-; CI-NEXT:    s_mov_b32 s6, 0
-; CI-NEXT:    s_brev_b32 s7, 1
-; CI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
-; CI-NEXT:    v_mov_b32_e32 v0, s6
-; CI-NEXT:    v_mov_b32_e32 v1, s7
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s4, 0
 ; CI-NEXT:  .LBB8_2: ; %Flow16
-; CI-NEXT:    s_xor_b32 s6, s6, 1
-; CI-NEXT:    s_and_b32 s6, s6, 1
-; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_xor_b32 s4, s4, 1
+; CI-NEXT:    s_and_b32 s4, s4, 1
+; CI-NEXT:    s_cmp_lg_u32 s4, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB8_8
 ; CI-NEXT:  ; %bb.3: ; %frem.compute
-; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
-; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
-; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[2:3]|
 ; CI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
-; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v6
 ; CI-NEXT:    v_add_i32_e32 v8, vcc, -1, v7
 ; CI-NEXT:    v_sub_i32_e32 v9, vcc, v2, v8
@@ -1679,9 +1669,9 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:  .LBB8_7: ; %frem.loop_exit
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffe7, v9
 ; CI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
-; CI-NEXT:    s_mov_b32 s6, 0
-; CI-NEXT:    s_brev_b32 s7, 1
-; CI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:    s_brev_b32 s5, 1
+; CI-NEXT:    s_and_b64 s[4:5], s[0:1], s[4:5]
 ; CI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
 ; CI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
@@ -1690,65 +1680,64 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
-; CI-NEXT:    v_xor_b32_e32 v0, s6, v0
-; CI-NEXT:    v_xor_b32_e32 v1, s7, v1
+; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:    v_xor_b32_e32 v1, s5, v1
 ; CI-NEXT:  .LBB8_8: ; %Flow17
-; CI-NEXT:    v_mov_b32_e32 v2, 0x60
-; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v2
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, s[2:3], 0
 ; CI-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
-; CI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; CI-NEXT:    v_mov_b32_e32 v3, 0x3fc
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s11, 0xf000
 ; CI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v3
-; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[4:5], 3
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; CI-NEXT:    v_cmp_class_f64_e64 s[0:1], s[0:1], v3
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: unsafe_frem_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s6, 1
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT:    s_mov_b32 s4, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[10:11], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[0:1]|, |v[0:1]|
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; VI-NEXT:    s_cbranch_vccz .LBB8_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[0:1]|, |v[0:1]|
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_brev_b32 s5, 1
+; VI-NEXT:    s_and_b64 s[4:5], s[0:1], s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
-; VI-NEXT:    s_mov_b32 s6, 0
-; VI-NEXT:    s_brev_b32 s7, 1
-; VI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:    s_mov_b32 s4, 0
 ; VI-NEXT:  .LBB8_2: ; %Flow16
-; VI-NEXT:    s_xor_b32 s6, s6, 1
-; VI-NEXT:    s_and_b32 s6, s6, 1
-; VI-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-NEXT:    s_xor_b32 s4, s4, 1
+; VI-NEXT:    s_and_b32 s4, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB8_8
 ; VI-NEXT:  ; %bb.3: ; %frem.compute
-; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
-; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
-; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[2:3]|
 ; VI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
-; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v6
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, -1, v7
 ; VI-NEXT:    v_sub_u32_e32 v9, vcc, v2, v8
@@ -1787,9 +1776,9 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:  .LBB8_7: ; %frem.loop_exit
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0xffffffe7, v9
 ; VI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
-; VI-NEXT:    s_mov_b32 s6, 0
-; VI-NEXT:    s_brev_b32 s7, 1
-; VI-NEXT:    s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT:    s_mov_b32 s4, 0
+; VI-NEXT:    s_brev_b32 s5, 1
+; VI-NEXT:    s_and_b64 s[4:5], s[0:1], s[4:5]
 ; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
 ; VI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
@@ -1798,23 +1787,22 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
-; VI-NEXT:    v_xor_b32_e32 v0, s6, v0
-; VI-NEXT:    v_xor_b32_e32 v1, s7, v1
+; VI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; VI-NEXT:    v_xor_b32_e32 v1, s5, v1
 ; VI-NEXT:  .LBB8_8: ; %Flow17
-; VI-NEXT:    v_mov_b32_e32 v2, 0x60
-; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v2
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, s[2:3], 0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x7ff80000
-; VI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT:    v_mov_b32_e32 v3, 0x3fc
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[2:3], v3
-; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[4:5], 3
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT:    v_cmp_class_f64_e64 s[0:1], s[0:1], v3
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s8
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s9
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
@@ -1828,35 +1816,35 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
 ; CI-LABEL: frem_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xd
 ; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
-; CI-NEXT:    s_mov_b32 s4, 1
+; CI-NEXT:    s_load_dword s0, s[10:11], 0x0
+; CI-NEXT:    s_load_dword s1, s[2:3], 0x4
+; CI-NEXT:    s_mov_b32 s2, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
-; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s3|
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s0|
+; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s1|
 ; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
 ; CI-NEXT:    s_cbranch_vccz .LBB9_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
-; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT:    s_and_b32 s2, s0, 0xffff8000
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; CI-NEXT:    s_mov_b32 s4, 0
+; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB9_2: ; %Flow60
-; CI-NEXT:    s_xor_b32 s4, s4, 1
-; CI-NEXT:    s_and_b32 s4, s4, 1
-; CI-NEXT:    s_cmp_lg_u32 s4, 0
+; CI-NEXT:    s_xor_b32 s2, s2, 1
+; CI-NEXT:    s_and_b32 s2, s2, 1
+; CI-NEXT:    s_cmp_lg_u32 s2, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB9_8
 ; CI-NEXT:  ; %bb.3: ; %frem.compute
 ; CI-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
 ; CI-NEXT:    v_ldexp_f32_e64 v1, v3, 1
-; CI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT:    v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e32 v0, v2
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
@@ -1907,34 +1895,34 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
-; CI-NEXT:    v_xor_b32_e32 v0, s4, v0
+; CI-NEXT:    s_and_b32 s2, s0, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; CI-NEXT:  .LBB9_8: ; %Flow61
-; CI-NEXT:    s_lshr_b32 s4, s2, 16
-; CI-NEXT:    s_lshr_b32 s5, s3, 16
-; CI-NEXT:    v_cvt_f32_f16_e64 v3, |s4|
-; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s5|
-; CI-NEXT:    s_mov_b32 s6, 1
+; CI-NEXT:    s_lshr_b32 s2, s0, 16
+; CI-NEXT:    s_lshr_b32 s3, s1, 16
+; CI-NEXT:    v_cvt_f32_f16_e64 v3, |s2|
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s3|
+; CI-NEXT:    s_mov_b32 s4, 1
 ; CI-NEXT:    ; implicit-def: $vgpr1
 ; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v3, v2
 ; CI-NEXT:    s_cbranch_vccz .LBB9_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else20
-; CI-NEXT:    s_and_b32 s6, s4, 0xffff8000
+; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s6
-; CI-NEXT:    v_mov_b32_e32 v4, s4
+; CI-NEXT:    v_mov_b32_e32 v1, s4
+; CI-NEXT:    v_mov_b32_e32 v4, s2
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s4, 0
 ; CI-NEXT:  .LBB9_10: ; %Flow56
-; CI-NEXT:    s_xor_b32 s6, s6, 1
-; CI-NEXT:    s_and_b32 s6, s6, 1
-; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_xor_b32 s4, s4, 1
+; CI-NEXT:    s_and_b32 s4, s4, 1
+; CI-NEXT:    s_cmp_lg_u32 s4, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB9_16
 ; CI-NEXT:  ; %bb.11: ; %frem.compute19
 ; CI-NEXT:    v_frexp_mant_f32_e32 v4, v2
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v2
 ; CI-NEXT:    v_ldexp_f32_e64 v2, v4, 1
-; CI-NEXT:    v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e32 v1, v3
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v3
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v6
@@ -1985,72 +1973,69 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    s_and_b32 s6, s4, 0xffff8000
-; CI-NEXT:    v_xor_b32_e32 v1, s6, v1
+; CI-NEXT:    s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; CI-NEXT:  .LBB9_16: ; %Flow57
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, s1
+; CI-NEXT:    s_and_b32 s1, s1, 0x7fff
+; CI-NEXT:    s_and_b32 s1, 0xffff, s1
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, 0
+; CI-NEXT:    s_cmpk_lt_u32 s1, 0x7c01
+; CI-NEXT:    s_cselect_b32 s1, 1, 0
+; CI-NEXT:    s_and_b32 s0, s0, 0x7fff
+; CI-NEXT:    s_and_b32 s0, 0xffff, s0
+; CI-NEXT:    s_cmpk_lt_u32 s0, 0x7c00
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v3
+; CI-NEXT:    s_cselect_b32 s0, 1, 0
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
 ; CI-NEXT:    s_and_b32 s3, s3, 0x7fff
+; CI-NEXT:    s_and_b32 s4, s1, s0
 ; CI-NEXT:    s_and_b32 s3, 0xffff, s3
-; CI-NEXT:    s_cmp_eq_u32 s3, 0
-; CI-NEXT:    s_cselect_b32 s6, 1, 0
+; CI-NEXT:    s_cmpk_lt_u32 s3, 0x7c01
+; CI-NEXT:    s_cselect_b32 s3, 1, 0
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    s_and_b32 s2, 0xffff, s2
 ; CI-NEXT:    s_cmpk_lt_u32 s2, 0x7c00
 ; CI-NEXT:    s_cselect_b32 s2, 1, 0
-; CI-NEXT:    s_cmpk_le_u32 s3, 0x7c00
-; CI-NEXT:    s_cselect_b32 s3, 1, 0
+; CI-NEXT:    v_cmp_eq_f32_e64 s[0:1], v2, v3
 ; CI-NEXT:    s_and_b32 s2, s3, s2
-; CI-NEXT:    s_and_b32 s3, s5, 0x7fff
-; CI-NEXT:    s_and_b32 s3, 0xffff, s3
-; CI-NEXT:    s_cmp_eq_u32 s3, 0
-; CI-NEXT:    s_cselect_b32 s5, 1, 0
-; CI-NEXT:    s_and_b32 s4, s4, 0x7fff
-; CI-NEXT:    s_and_b32 s4, 0xffff, s4
-; CI-NEXT:    s_cmpk_lt_u32 s4, 0x7c00
-; CI-NEXT:    s_cselect_b32 s4, 1, 0
-; CI-NEXT:    s_cmpk_le_u32 s3, 0x7c00
-; CI-NEXT:    s_cselect_b32 s3, 1, 0
-; CI-NEXT:    s_and_b32 s3, s3, s4
-; CI-NEXT:    s_and_b32 s4, 1, s6
 ; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
 ; CI-NEXT:    v_mov_b32_e32 v2, 0x7e00
-; CI-NEXT:    s_and_b32 s2, 1, s2
+; CI-NEXT:    s_and_b32 s3, 1, s4
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; CI-NEXT:    s_and_b32 s2, 1, s5
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
+; CI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; CI-NEXT:    s_and_b32 s0, 1, s2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; CI-NEXT:    s_and_b32 s2, 1, s3
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s1, 1
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s0, s[10:11], 0x0
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x10
+; VI-NEXT:    s_load_dword s2, s[10:11], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x10
+; VI-NEXT:    s_mov_b32 s1, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s0|
-; VI-NEXT:    v_cvt_f32_f16_e64 v1, |s2|
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, |s0|
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
 ; VI-NEXT:    s_cbranch_vccz .LBB9_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    s_and_b32 s1, s2, 0xffff8000
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; VI-NEXT:    s_mov_b32 s1, 0
 ; VI-NEXT:  .LBB9_2: ; %Flow60
@@ -2113,22 +2098,22 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT:    s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT:    s_and_b32 s1, s2, 0xffff8000
 ; VI-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; VI-NEXT:  .LBB9_8: ; %Flow61
-; VI-NEXT:    s_lshr_b32 s4, s0, 16
 ; VI-NEXT:    s_lshr_b32 s6, s2, 16
-; VI-NEXT:    v_cvt_f32_f16_e64 v3, |s4|
-; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s6|
+; VI-NEXT:    s_lshr_b32 s4, s0, 16
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |s6|
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s4|
 ; VI-NEXT:    s_mov_b32 s1, 1
 ; VI-NEXT:    ; implicit-def: $vgpr1
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v3, v2
 ; VI-NEXT:    s_cbranch_vccz .LBB9_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else20
-; VI-NEXT:    s_and_b32 s1, s4, 0xffff8000
+; VI-NEXT:    s_and_b32 s1, s6, 0xffff8000
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; VI-NEXT:    s_mov_b32 s1, 0
 ; VI-NEXT:  .LBB9_10: ; %Flow56
@@ -2191,23 +2176,21 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; VI-NEXT:    v_ldexp_f32 v1, v2, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT:    s_and_b32 s1, s4, 0xffff8000
+; VI-NEXT:    s_and_b32 s1, s6, 0xffff8000
 ; VI-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; VI-NEXT:  .LBB9_16: ; %Flow57
-; VI-NEXT:    v_mov_b32_e32 v2, 0x60
-; VI-NEXT:    v_cmp_class_f16_e32 vcc, s2, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3fc
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x1f8
-; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s2, 3
-; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s0, v3
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s6, v2
-; VI-NEXT:    v_cmp_class_f16_e64 s[6:7], s6, 3
-; VI-NEXT:    v_cmp_class_f16_e64 s[4:5], s4, v3
-; VI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
+; VI-NEXT:    v_cmp_eq_f16_e64 vcc, s0, 0
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s0, v2
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s2, v3
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    v_cmp_eq_f16_e64 s[2:3], s4, 0
+; VI-NEXT:    v_cmp_class_f16_e64 s[4:5], s4, v2
+; VI-NEXT:    v_cmp_class_f16_e64 s[6:7], s6, v3
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; VI-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[2:3]
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -2230,35 +2213,35 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
 ; CI-LABEL: frem_v4f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s6, 1
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[10:11], 0x0
+; CI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x8
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s2|
-; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s4|
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s4|
+; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s6|
 ; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
 ; CI-NEXT:    s_cbranch_vccz .LBB10_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
-; CI-NEXT:    s_and_b32 s6, s2, 0xffff8000
+; CI-NEXT:    s_and_b32 s0, s4, 0xffff8000
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
-; CI-NEXT:    v_mov_b32_e32 v0, s6
-; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v3, s4
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB10_2: ; %Flow144
-; CI-NEXT:    s_xor_b32 s6, s6, 1
-; CI-NEXT:    s_and_b32 s6, s6, 1
-; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB10_8
 ; CI-NEXT:  ; %bb.3: ; %frem.compute
 ; CI-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
 ; CI-NEXT:    v_ldexp_f32_e64 v1, v3, 1
-; CI-NEXT:    v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e32 v0, v2
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v2
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
@@ -2309,34 +2292,34 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    s_and_b32 s6, s2, 0xffff8000
-; CI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; CI-NEXT:    s_and_b32 s0, s4, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; CI-NEXT:  .LBB10_8: ; %Flow145
-; CI-NEXT:    s_lshr_b32 s6, s2, 16
-; CI-NEXT:    s_lshr_b32 s7, s4, 16
-; CI-NEXT:    v_cvt_f32_f16_e64 v3, |s6|
-; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s7|
-; CI-NEXT:    s_mov_b32 s8, 1
+; CI-NEXT:    s_lshr_b32 s2, s4, 16
+; CI-NEXT:    s_lshr_b32 s3, s6, 16
+; CI-NEXT:    v_cvt_f32_f16_e64 v3, |s2|
+; CI-NEXT:    v_cvt_f32_f16_e64 v2, |s3|
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    ; implicit-def: $vgpr1
 ; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v3, v2
 ; CI-NEXT:    s_cbranch_vccz .LBB10_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else20
-; CI-NEXT:    s_and_b32 s8, s6, 0xffff8000
+; CI-NEXT:    s_and_b32 s0, s2, 0xffff8000
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
-; CI-NEXT:    v_mov_b32_e32 v1, s8
-; CI-NEXT:    v_mov_b32_e32 v4, s6
+; CI-NEXT:    v_mov_b32_e32 v1, s0
+; CI-NEXT:    v_mov_b32_e32 v4, s2
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CI-NEXT:    s_mov_b32 s8, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB10_10: ; %Flow140
-; CI-NEXT:    s_xor_b32 s8, s8, 1
-; CI-NEXT:    s_and_b32 s8, s8, 1
-; CI-NEXT:    s_cmp_lg_u32 s8, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB10_16
 ; CI-NEXT:  ; %bb.11: ; %frem.compute19
 ; CI-NEXT:    v_frexp_mant_f32_e32 v4, v2
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v2
 ; CI-NEXT:    v_ldexp_f32_e64 v2, v4, 1
-; CI-NEXT:    v_div_scale_f32 v4, s[8:9], v2, v2, 1.0
+; CI-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e32 v1, v3
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v3
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v6
@@ -2387,32 +2370,32 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    s_and_b32 s8, s6, 0xffff8000
-; CI-NEXT:    v_xor_b32_e32 v1, s8, v1
+; CI-NEXT:    s_and_b32 s0, s2, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; CI-NEXT:  .LBB10_16: ; %Flow141
-; CI-NEXT:    v_cvt_f32_f16_e64 v4, |s3|
-; CI-NEXT:    v_cvt_f32_f16_e64 v3, |s5|
-; CI-NEXT:    s_mov_b32 s8, 1
+; CI-NEXT:    v_cvt_f32_f16_e64 v4, |s5|
+; CI-NEXT:    v_cvt_f32_f16_e64 v3, |s7|
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    ; implicit-def: $vgpr2
 ; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
 ; CI-NEXT:    s_cbranch_vccz .LBB10_18
 ; CI-NEXT:  ; %bb.17: ; %frem.else56
-; CI-NEXT:    s_and_b32 s8, s3, 0xffff8000
+; CI-NEXT:    s_and_b32 s0, s5, 0xffff8000
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
-; CI-NEXT:    v_mov_b32_e32 v2, s8
-; CI-NEXT:    v_mov_b32_e32 v5, s3
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v5, s5
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; CI-NEXT:    s_mov_b32 s8, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB10_18: ; %Flow136
-; CI-NEXT:    s_xor_b32 s8, s8, 1
-; CI-NEXT:    s_and_b32 s8, s8, 1
-; CI-NEXT:    s_cmp_lg_u32 s8, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB10_24
 ; CI-NEXT:  ; %bb.19: ; %frem.compute55
 ; CI-NEXT:    v_frexp_mant_f32_e32 v5, v3
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
 ; CI-NEXT:    v_ldexp_f32_e64 v3, v5, 1
-; CI-NEXT:    v_div_scale_f32 v5, s[8:9], v3, v3, 1.0
+; CI-NEXT:    v_div_scale_f32 v5, s[0:1], v3, v3, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e32 v2, v4
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, -1, v7
@@ -2463,34 +2446,34 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v2, v3, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    s_and_b32 s8, s3, 0xffff8000
-; CI-NEXT:    v_xor_b32_e32 v2, s8, v2
+; CI-NEXT:    s_and_b32 s0, s5, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; CI-NEXT:  .LBB10_24: ; %Flow137
-; CI-NEXT:    s_lshr_b32 s8, s3, 16
-; CI-NEXT:    s_lshr_b32 s9, s5, 16
-; CI-NEXT:    v_cvt_f32_f16_e64 v5, |s8|
-; CI-NEXT:    v_cvt_f32_f16_e64 v4, |s9|
-; CI-NEXT:    s_mov_b32 s10, 1
+; CI-NEXT:    s_lshr_b32 s10, s5, 16
+; CI-NEXT:    s_lshr_b32 s11, s7, 16
+; CI-NEXT:    v_cvt_f32_f16_e64 v5, |s10|
+; CI-NEXT:    v_cvt_f32_f16_e64 v4, |s11|
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    ; implicit-def: $vgpr3
 ; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v5, v4
 ; CI-NEXT:    s_cbranch_vccz .LBB10_26
 ; CI-NEXT:  ; %bb.25: ; %frem.else92
-; CI-NEXT:    s_and_b32 s10, s8, 0xffff8000
+; CI-NEXT:    s_and_b32 s0, s10, 0xffff8000
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v5, v4
-; CI-NEXT:    v_mov_b32_e32 v3, s10
-; CI-NEXT:    v_mov_b32_e32 v6, s8
+; CI-NEXT:    v_mov_b32_e32 v3, s0
+; CI-NEXT:    v_mov_b32_e32 v6, s10
 ; CI-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
-; CI-NEXT:    s_mov_b32 s10, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB10_26: ; %Flow132
-; CI-NEXT:    s_xor_b32 s10, s10, 1
-; CI-NEXT:    s_and_b32 s10, s10, 1
-; CI-NEXT:    s_cmp_lg_u32 s10, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB10_32
 ; CI-NEXT:  ; %bb.27: ; %frem.compute91
 ; CI-NEXT:    v_frexp_mant_f32_e32 v6, v4
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v9, v4
 ; CI-NEXT:    v_ldexp_f32_e64 v4, v6, 1
-; CI-NEXT:    v_div_scale_f32 v6, s[10:11], v4, v4, 1.0
+; CI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e32 v3, v5
 ; CI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v5
 ; CI-NEXT:    v_add_i32_e32 v5, vcc, -1, v8
@@ -2541,89 +2524,82 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v3, v4, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    s_and_b32 s10, s8, 0xffff8000
-; CI-NEXT:    v_xor_b32_e32 v3, s10, v3
+; CI-NEXT:    s_and_b32 s0, s10, 0xffff8000
+; CI-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; CI-NEXT:  .LBB10_32: ; %Flow133
-; CI-NEXT:    s_and_b32 s4, s4, 0x7fff
-; CI-NEXT:    s_and_b32 s4, 0xffff, s4
-; CI-NEXT:    s_cmp_eq_u32 s4, 0
-; CI-NEXT:    s_cselect_b32 s10, 1, 0
+; CI-NEXT:    s_and_b32 s0, s6, 0x7fff
+; CI-NEXT:    s_and_b32 s0, 0xffff, s0
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, 0
+; CI-NEXT:    s_cmpk_lt_u32 s0, 0x7c01
+; CI-NEXT:    s_cselect_b32 s0, 1, 0
+; CI-NEXT:    s_and_b32 s1, s4, 0x7fff
+; CI-NEXT:    s_and_b32 s1, 0xffff, s1
+; CI-NEXT:    s_cmpk_lt_u32 s1, 0x7c00
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v5
+; CI-NEXT:    s_cselect_b32 s1, 1, 0
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, s3
+; CI-NEXT:    s_and_b32 s3, s3, 0x7fff
+; CI-NEXT:    s_and_b32 s6, s0, s1
+; CI-NEXT:    s_and_b32 s3, 0xffff, s3
+; CI-NEXT:    s_cmpk_lt_u32 s3, 0x7c01
+; CI-NEXT:    s_cselect_b32 s3, 1, 0
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    s_and_b32 s2, 0xffff, s2
 ; CI-NEXT:    s_cmpk_lt_u32 s2, 0x7c00
 ; CI-NEXT:    s_cselect_b32 s2, 1, 0
-; CI-NEXT:    s_cmpk_le_u32 s4, 0x7c00
-; CI-NEXT:    s_cselect_b32 s4, 1, 0
-; CI-NEXT:    s_and_b32 s2, s4, s2
 ; CI-NEXT:    s_and_b32 s4, s7, 0x7fff
+; CI-NEXT:    s_and_b32 s12, s3, s2
 ; CI-NEXT:    s_and_b32 s4, 0xffff, s4
-; CI-NEXT:    s_cmp_eq_u32 s4, 0
-; CI-NEXT:    s_cselect_b32 s7, 1, 0
-; CI-NEXT:    s_and_b32 s6, s6, 0x7fff
-; CI-NEXT:    s_and_b32 s6, 0xffff, s6
-; CI-NEXT:    s_cmpk_lt_u32 s6, 0x7c00
-; CI-NEXT:    s_cselect_b32 s6, 1, 0
-; CI-NEXT:    s_cmpk_le_u32 s4, 0x7c00
+; CI-NEXT:    v_cmp_eq_f32_e64 s[0:1], v4, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, s7
+; CI-NEXT:    s_cmpk_lt_u32 s4, 0x7c01
 ; CI-NEXT:    s_cselect_b32 s4, 1, 0
 ; CI-NEXT:    s_and_b32 s5, s5, 0x7fff
-; CI-NEXT:    s_and_b32 s4, s4, s6
-; CI-NEXT:    s_and_b32 s5, 0xffff, s5
-; CI-NEXT:    s_cmp_eq_u32 s5, 0
-; CI-NEXT:    s_cselect_b32 s6, 1, 0
-; CI-NEXT:    s_and_b32 s3, s3, 0x7fff
-; CI-NEXT:    s_and_b32 s3, 0xffff, s3
-; CI-NEXT:    s_cmpk_lt_u32 s3, 0x7c00
-; CI-NEXT:    s_cselect_b32 s3, 1, 0
-; CI-NEXT:    s_cmpk_le_u32 s5, 0x7c00
-; CI-NEXT:    s_cselect_b32 s5, 1, 0
-; CI-NEXT:    s_and_b32 s3, s5, s3
-; CI-NEXT:    s_and_b32 s5, s9, 0x7fff
 ; CI-NEXT:    s_and_b32 s5, 0xffff, s5
-; CI-NEXT:    s_cmp_eq_u32 s5, 0
-; CI-NEXT:    s_cselect_b32 s9, 1, 0
-; CI-NEXT:    s_and_b32 s8, s8, 0x7fff
-; CI-NEXT:    s_and_b32 s8, 0xffff, s8
-; CI-NEXT:    s_cmpk_lt_u32 s8, 0x7c00
-; CI-NEXT:    s_cselect_b32 s8, 1, 0
-; CI-NEXT:    s_cmpk_le_u32 s5, 0x7c00
+; CI-NEXT:    s_cmpk_lt_u32 s5, 0x7c00
+; CI-NEXT:    v_cmp_eq_f32_e64 s[2:3], v4, v5
 ; CI-NEXT:    s_cselect_b32 s5, 1, 0
-; CI-NEXT:    s_and_b32 s5, s5, s8
-; CI-NEXT:    s_and_b32 s8, 1, s10
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, s11
+; CI-NEXT:    s_and_b32 s11, s11, 0x7fff
+; CI-NEXT:    s_and_b32 s7, s4, s5
+; CI-NEXT:    s_and_b32 s11, 0xffff, s11
+; CI-NEXT:    s_cmpk_lt_u32 s11, 0x7c01
+; CI-NEXT:    s_cselect_b32 s11, 1, 0
+; CI-NEXT:    s_and_b32 s10, s10, 0x7fff
+; CI-NEXT:    s_and_b32 s10, 0xffff, s10
+; CI-NEXT:    s_cmpk_lt_u32 s10, 0x7c00
+; CI-NEXT:    v_cmp_eq_f32_e64 s[4:5], v4, v5
+; CI-NEXT:    s_cselect_b32 s10, 1, 0
 ; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s8
 ; CI-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; CI-NEXT:    s_and_b32 s2, 1, s2
+; CI-NEXT:    s_and_b32 s6, 1, s6
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; CI-NEXT:    s_and_b32 s2, 1, s7
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s6
+; CI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; CI-NEXT:    s_and_b32 s0, 1, s12
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; CI-NEXT:    s_and_b32 s2, 1, s4
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT:    s_and_b32 s2, 1, s6
+; CI-NEXT:    s_and_b32 s10, s11, s10
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; CI-NEXT:    s_and_b32 s2, 1, s3
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; CI-NEXT:    s_and_b32 s2, 1, s9
-; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT:    s_and_b32 s0, 1, s7
+; CI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; CI-NEXT:    s_and_b32 s2, 1, s5
-; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT:    s_and_b32 s0, 1, s10
+; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; CI-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    v_or_b32_e32 v1, v1, v2
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v4f16:
@@ -2632,19 +2608,19 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[18:19], 0x0
-; VI-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x20
+; VI-NEXT:    s_load_dwordx2 s[10:11], s[18:19], 0x0
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
 ; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s8|
-; VI-NEXT:    v_cvt_f32_f16_e64 v1, |s10|
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s10|
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, |s8|
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v2, v1
 ; VI-NEXT:    s_cbranch_vccz .LBB10_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_and_b32 s0, s8, 0xffff8000
+; VI-NEXT:    s_and_b32 s0, s10, 0xffff8000
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s8
+; VI-NEXT:    v_mov_b32_e32 v3, s10
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB10_2: ; %Flow144
@@ -2707,22 +2683,22 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT:    s_and_b32 s0, s8, 0xffff8000
+; VI-NEXT:    s_and_b32 s0, s10, 0xffff8000
 ; VI-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; VI-NEXT:  .LBB10_8: ; %Flow145
-; VI-NEXT:    s_lshr_b32 s4, s8, 16
 ; VI-NEXT:    s_lshr_b32 s6, s10, 16
-; VI-NEXT:    v_cvt_f32_f16_e64 v3, |s4|
-; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s6|
+; VI-NEXT:    s_lshr_b32 s4, s8, 16
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |s6|
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |s4|
 ; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    ; implicit-def: $vgpr1
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v3, v2
 ; VI-NEXT:    s_cbranch_vccz .LBB10_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else20
-; VI-NEXT:    s_and_b32 s0, s4, 0xffff8000
+; VI-NEXT:    s_and_b32 s0, s6, 0xffff8000
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB10_10: ; %Flow140
@@ -2785,20 +2761,20 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; VI-NEXT:    v_ldexp_f32 v1, v2, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT:    s_and_b32 s0, s4, 0xffff8000
+; VI-NEXT:    s_and_b32 s0, s6, 0xffff8000
 ; VI-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; VI-NEXT:  .LBB10_16: ; %Flow141
-; VI-NEXT:    v_cvt_f32_f16_e64 v4, |s9|
-; VI-NEXT:    v_cvt_f32_f16_e64 v3, |s11|
+; VI-NEXT:    v_cvt_f32_f16_e64 v4, |s11|
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |s9|
 ; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    ; implicit-def: $vgpr2
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
 ; VI-NEXT:    s_cbranch_vccz .LBB10_18
 ; VI-NEXT:  ; %bb.17: ; %frem.else56
-; VI-NEXT:    s_and_b32 s0, s9, 0xffff8000
+; VI-NEXT:    s_and_b32 s0, s11, 0xffff8000
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_mov_b32_e32 v5, s9
+; VI-NEXT:    v_mov_b32_e32 v5, s11
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB10_18: ; %Flow136
@@ -2861,22 +2837,22 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; VI-NEXT:    v_ldexp_f32 v2, v3, v2
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; VI-NEXT:    s_and_b32 s0, s9, 0xffff8000
+; VI-NEXT:    s_and_b32 s0, s11, 0xffff8000
 ; VI-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; VI-NEXT:  .LBB10_24: ; %Flow137
-; VI-NEXT:    s_lshr_b32 s12, s9, 16
 ; VI-NEXT:    s_lshr_b32 s14, s11, 16
-; VI-NEXT:    v_cvt_f32_f16_e64 v5, |s12|
-; VI-NEXT:    v_cvt_f32_f16_e64 v4, |s14|
+; VI-NEXT:    s_lshr_b32 s12, s9, 16
+; VI-NEXT:    v_cvt_f32_f16_e64 v5, |s14|
+; VI-NEXT:    v_cvt_f32_f16_e64 v4, |s12|
 ; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    ; implicit-def: $vgpr3
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v5, v4
 ; VI-NEXT:    s_cbranch_vccz .LBB10_26
 ; VI-NEXT:  ; %bb.25: ; %frem.else92
-; VI-NEXT:    s_and_b32 s0, s12, 0xffff8000
+; VI-NEXT:    s_and_b32 s0, s14, 0xffff8000
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v5, v4
 ; VI-NEXT:    v_mov_b32_e32 v3, s0
-; VI-NEXT:    v_mov_b32_e32 v6, s12
+; VI-NEXT:    v_mov_b32_e32 v6, s14
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB10_26: ; %Flow132
@@ -2939,42 +2915,38 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; VI-NEXT:    v_ldexp_f32 v3, v4, v3
 ; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; VI-NEXT:    s_and_b32 s0, s12, 0xffff8000
+; VI-NEXT:    s_and_b32 s0, s14, 0xffff8000
 ; VI-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; VI-NEXT:  .LBB10_32: ; %Flow133
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3fc
 ; VI-NEXT:    v_mov_b32_e32 v5, 0x1f8
-; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s10, 3
-; VI-NEXT:    v_mov_b32_e32 v4, 0x60
-; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s8, v5
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s6, v4
-; VI-NEXT:    v_cmp_class_f16_e64 s[6:7], s6, 3
-; VI-NEXT:    v_cmp_class_f16_e64 s[4:5], s4, v5
-; VI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
-; VI-NEXT:    v_cmp_class_f16_e32 vcc, s10, v4
-; VI-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; VI-NEXT:    v_cmp_class_f16_e64 s[6:7], s11, v4
-; VI-NEXT:    v_cmp_class_f16_e64 s[10:11], s11, 3
-; VI-NEXT:    v_cmp_class_f16_e64 s[8:9], s9, v5
-; VI-NEXT:    s_xor_b64 s[10:11], s[10:11], -1
-; VI-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
-; VI-NEXT:    v_cmp_class_f16_e64 s[10:11], s14, v4
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], s8, v4
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], s10, v5
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    v_cmp_eq_f16_e64 s[2:3], s4, 0
+; VI-NEXT:    v_cmp_class_f16_e64 s[4:5], s4, v4
+; VI-NEXT:    v_cmp_class_f16_e64 s[6:7], s6, v5
+; VI-NEXT:    v_cmp_eq_f16_e64 vcc, s8, 0
+; VI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; VI-NEXT:    v_cmp_eq_f16_e64 s[6:7], s9, 0
+; VI-NEXT:    v_cmp_class_f16_e64 s[8:9], s9, v4
+; VI-NEXT:    v_cmp_class_f16_e64 s[10:11], s11, v5
+; VI-NEXT:    s_and_b64 s[8:9], s[8:9], s[10:11]
+; VI-NEXT:    v_cmp_eq_f16_e64 s[10:11], s12, 0
+; VI-NEXT:    v_cmp_class_f16_e64 s[12:13], s12, v4
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; VI-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; VI-NEXT:    v_cmp_class_f16_e64 s[14:15], s14, 3
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[0:1]
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_cmp_class_f16_e64 s[12:13], s12, v5
-; VI-NEXT:    s_xor_b64 s[14:15], s[14:15], -1
+; VI-NEXT:    v_cmp_class_f16_e64 s[14:15], s14, v5
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; VI-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
+; VI-NEXT:    s_and_b64 s[12:13], s[12:13], s[14:15]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[10:11]
 ; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[6:7]
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[12:13]
@@ -2996,34 +2968,34 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
 ; CI-LABEL: frem_v2f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s6, 1
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
+; CI-NEXT:    s_load_dwordx2 s[2:3], s[10:11], 0x0
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
 ; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
 ; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_cbranch_vccz .LBB11_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
-; CI-NEXT:    s_and_b32 s6, s2, 0x80000000
+; CI-NEXT:    s_and_b32 s0, s2, 0x80000000
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; CI-NEXT:    v_mov_b32_e32 v1, s6
+; CI-NEXT:    v_mov_b32_e32 v1, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB11_2: ; %Flow56
-; CI-NEXT:    s_xor_b32 s6, s6, 1
-; CI-NEXT:    s_and_b32 s6, s6, 1
-; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB11_8
 ; CI-NEXT:  ; %bb.3: ; %frem.compute
 ; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s4|
 ; CI-NEXT:    v_ldexp_f32_e64 v1, v1, 1
-; CI-NEXT:    v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
 ; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
 ; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s4|
@@ -3074,31 +3046,31 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_add_f32_e32 v1, v2, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-NEXT:    s_and_b32 s6, s2, 0x80000000
-; CI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; CI-NEXT:    s_and_b32 s0, s2, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; CI-NEXT:  .LBB11_8: ; %Flow57
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
 ; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s3|, |v1|
-; CI-NEXT:    s_mov_b32 s6, 1
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    ; implicit-def: $vgpr1
 ; CI-NEXT:    s_cbranch_vccz .LBB11_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else16
-; CI-NEXT:    s_and_b32 s6, s3, 0x80000000
+; CI-NEXT:    s_and_b32 s0, s3, 0x80000000
 ; CI-NEXT:    v_mov_b32_e32 v2, s5
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; CI-NEXT:    v_mov_b32_e32 v2, s6
+; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB11_10: ; %Flow52
-; CI-NEXT:    s_xor_b32 s6, s6, 1
-; CI-NEXT:    s_and_b32 s6, s6, 1
-; CI-NEXT:    s_cmp_lg_u32 s6, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB11_16
 ; CI-NEXT:  ; %bb.11: ; %frem.compute15
 ; CI-NEXT:    v_frexp_mant_f32_e64 v2, |s5|
 ; CI-NEXT:    v_ldexp_f32_e64 v2, v2, 1
-; CI-NEXT:    v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; CI-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
 ; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
 ; CI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s5|
@@ -3149,61 +3121,59 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_add_f32_e32 v2, v3, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; CI-NEXT:    s_and_b32 s6, s3, 0x80000000
-; CI-NEXT:    v_xor_b32_e32 v1, s6, v1
+; CI-NEXT:    s_and_b32 s0, s3, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; CI-NEXT:  .LBB11_16: ; %Flow53
-; CI-NEXT:    v_mov_b32_e32 v2, 0x60
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v2
-; CI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, s4, 0
+; CI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v3, 0x3fc
 ; CI-NEXT:    v_mov_b32_e32 v4, 0x1f8
-; CI-NEXT:    v_cmp_class_f32_e64 s[6:7], s4, 3
-; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v4
-; CI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
-; CI-NEXT:    s_and_b64 vcc, s[6:7], vcc
-; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v2
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v4
-; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s5, 3
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v3
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], s2, v4
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, s5, 0
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v3
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], s3, v4
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
-; VI-NEXT:    s_mov_b32 s6, 1
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[10:11], 0x0
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x20
+; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
 ; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    s_cbranch_vccz .LBB11_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_and_b32 s6, s2, 0x80000000
+; VI-NEXT:    s_and_b32 s0, s2, 0x80000000
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB11_2: ; %Flow56
-; VI-NEXT:    s_xor_b32 s6, s6, 1
-; VI-NEXT:    s_and_b32 s6, s6, 1
-; VI-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB11_8
 ; VI-NEXT:  ; %bb.3: ; %frem.compute
 ; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s4|
 ; VI-NEXT:    v_ldexp_f32 v1, v1, 1
-; VI-NEXT:    v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; VI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, 1.0
 ; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s2|
 ; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s2|
 ; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s4|
@@ -3254,31 +3224,31 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_add_f32_e32 v1, v2, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
-; VI-NEXT:    s_and_b32 s6, s2, 0x80000000
-; VI-NEXT:    v_xor_b32_e32 v0, s6, v0
+; VI-NEXT:    s_and_b32 s0, s2, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; VI-NEXT:  .LBB11_8: ; %Flow57
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s3|, |v1|
-; VI-NEXT:    s_mov_b32 s6, 1
+; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    ; implicit-def: $vgpr1
 ; VI-NEXT:    s_cbranch_vccz .LBB11_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else16
-; VI-NEXT:    s_and_b32 s6, s3, 0x80000000
+; VI-NEXT:    s_and_b32 s0, s3, 0x80000000
 ; VI-NEXT:    v_mov_b32_e32 v2, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; VI-NEXT:    s_mov_b32 s6, 0
+; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB11_10: ; %Flow52
-; VI-NEXT:    s_xor_b32 s6, s6, 1
-; VI-NEXT:    s_and_b32 s6, s6, 1
-; VI-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB11_16
 ; VI-NEXT:  ; %bb.11: ; %frem.compute15
 ; VI-NEXT:    v_frexp_mant_f32_e64 v2, |s5|
 ; VI-NEXT:    v_ldexp_f32 v2, v2, 1
-; VI-NEXT:    v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; VI-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, 1.0
 ; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s3|
 ; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s3|
 ; VI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s5|
@@ -3329,28 +3299,26 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_add_f32_e32 v2, v3, v2
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; VI-NEXT:    v_ldexp_f32 v1, v2, v1
-; VI-NEXT:    s_and_b32 s6, s3, 0x80000000
-; VI-NEXT:    v_xor_b32_e32 v1, s6, v1
+; VI-NEXT:    s_and_b32 s0, s3, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; VI-NEXT:  .LBB11_16: ; %Flow53
-; VI-NEXT:    v_mov_b32_e32 v2, 0x60
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v2
-; VI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, s4, 0
+; VI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; VI-NEXT:    v_mov_b32_e32 v3, 0x3fc
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x1f8
-; VI-NEXT:    v_cmp_class_f32_e64 s[6:7], s4, 3
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v4
-; VI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
-; VI-NEXT:    s_and_b64 vcc, s[6:7], vcc
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v2
-; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v4
-; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s5, 3
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v3
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], s2, v4
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, s5, 0
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v3
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], s3, v4
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s8
+; VI-NEXT:    v_mov_b32_e32 v3, s9
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
@@ -3364,37 +3332,37 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
 ; CI-LABEL: frem_v4f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; CI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
-; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[10:11], 0x0
+; CI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x10
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s8
+; CI-NEXT:    v_mov_b32_e32 v0, s12
 ; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
 ; CI-NEXT:    ; implicit-def: $vgpr0
 ; CI-NEXT:    s_cbranch_vccz .LBB12_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
-; CI-NEXT:    s_and_b32 s2, s4, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v1, s8
+; CI-NEXT:    s_and_b32 s0, s4, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v1, s12
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v1|
-; CI-NEXT:    v_mov_b32_e32 v1, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB12_2: ; %Flow136
-; CI-NEXT:    s_xor_b32 s2, s2, 1
-; CI-NEXT:    s_and_b32 s2, s2, 1
-; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB12_8
 ; CI-NEXT:  ; %bb.3: ; %frem.compute
-; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s8|
+; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s12|
 ; CI-NEXT:    v_ldexp_f32_e64 v1, v1, 1
-; CI-NEXT:    v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e64 v0, |s4|
 ; CI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s4|
-; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s8|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s12|
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v5
 ; CI-NEXT:    v_ldexp_f32_e64 v4, v0, 12
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v6
@@ -3442,34 +3410,34 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_add_f32_e32 v1, v2, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-NEXT:    s_and_b32 s2, s4, 0x80000000
-; CI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; CI-NEXT:    s_and_b32 s0, s4, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; CI-NEXT:  .LBB12_8: ; %Flow137
-; CI-NEXT:    v_mov_b32_e32 v1, s9
+; CI-NEXT:    v_mov_b32_e32 v1, s13
 ; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s5|, |v1|
-; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    ; implicit-def: $vgpr1
 ; CI-NEXT:    s_cbranch_vccz .LBB12_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else16
-; CI-NEXT:    s_and_b32 s2, s5, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v2, s9
+; CI-NEXT:    s_and_b32 s0, s5, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v2, s13
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v2|
-; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB12_10: ; %Flow132
-; CI-NEXT:    s_xor_b32 s2, s2, 1
-; CI-NEXT:    s_and_b32 s2, s2, 1
-; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB12_16
 ; CI-NEXT:  ; %bb.11: ; %frem.compute15
-; CI-NEXT:    v_frexp_mant_f32_e64 v2, |s9|
+; CI-NEXT:    v_frexp_mant_f32_e64 v2, |s13|
 ; CI-NEXT:    v_ldexp_f32_e64 v2, v2, 1
-; CI-NEXT:    v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
+; CI-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e64 v1, |s5|
 ; CI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s5|
-; CI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s9|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s13|
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v6
 ; CI-NEXT:    v_ldexp_f32_e64 v5, v1, 12
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, -1, v7
@@ -3517,34 +3485,34 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_add_f32_e32 v2, v3, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; CI-NEXT:    s_and_b32 s2, s5, 0x80000000
-; CI-NEXT:    v_xor_b32_e32 v1, s2, v1
+; CI-NEXT:    s_and_b32 s0, s5, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; CI-NEXT:  .LBB12_16: ; %Flow133
-; CI-NEXT:    v_mov_b32_e32 v2, s10
+; CI-NEXT:    v_mov_b32_e32 v2, s14
 ; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s6|, |v2|
-; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    ; implicit-def: $vgpr2
 ; CI-NEXT:    s_cbranch_vccz .LBB12_18
 ; CI-NEXT:  ; %bb.17: ; %frem.else50
-; CI-NEXT:    s_and_b32 s2, s6, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v3, s10
+; CI-NEXT:    s_and_b32 s0, s6, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v3, s14
 ; CI-NEXT:    v_mov_b32_e32 v2, s6
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v3|
-; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB12_18: ; %Flow128
-; CI-NEXT:    s_xor_b32 s2, s2, 1
-; CI-NEXT:    s_and_b32 s2, s2, 1
-; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB12_24
 ; CI-NEXT:  ; %bb.19: ; %frem.compute49
-; CI-NEXT:    v_frexp_mant_f32_e64 v3, |s10|
+; CI-NEXT:    v_frexp_mant_f32_e64 v3, |s14|
 ; CI-NEXT:    v_ldexp_f32_e64 v3, v3, 1
-; CI-NEXT:    v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
+; CI-NEXT:    v_div_scale_f32 v5, s[0:1], v3, v3, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e64 v2, |s6|
 ; CI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s6|
-; CI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s10|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s14|
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, -1, v7
 ; CI-NEXT:    v_ldexp_f32_e64 v6, v2, 12
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v8
@@ -3592,34 +3560,34 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_add_f32_e32 v3, v4, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v2, v3, v2
-; CI-NEXT:    s_and_b32 s2, s6, 0x80000000
-; CI-NEXT:    v_xor_b32_e32 v2, s2, v2
+; CI-NEXT:    s_and_b32 s0, s6, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; CI-NEXT:  .LBB12_24: ; %Flow129
-; CI-NEXT:    v_mov_b32_e32 v3, s11
+; CI-NEXT:    v_mov_b32_e32 v3, s15
 ; CI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s7|, |v3|
-; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    s_mov_b32 s0, 1
 ; CI-NEXT:    ; implicit-def: $vgpr3
 ; CI-NEXT:    s_cbranch_vccz .LBB12_26
 ; CI-NEXT:  ; %bb.25: ; %frem.else84
-; CI-NEXT:    s_and_b32 s2, s7, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v4, s11
+; CI-NEXT:    s_and_b32 s0, s7, 0x80000000
+; CI-NEXT:    v_mov_b32_e32 v4, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s7
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v4|
-; CI-NEXT:    v_mov_b32_e32 v4, s2
+; CI-NEXT:    v_mov_b32_e32 v4, s0
 ; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_mov_b32 s0, 0
 ; CI-NEXT:  .LBB12_26: ; %Flow124
-; CI-NEXT:    s_xor_b32 s2, s2, 1
-; CI-NEXT:    s_and_b32 s2, s2, 1
-; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_xor_b32 s0, s0, 1
+; CI-NEXT:    s_and_b32 s0, s0, 1
+; CI-NEXT:    s_cmp_lg_u32 s0, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB12_32
 ; CI-NEXT:  ; %bb.27: ; %frem.compute83
-; CI-NEXT:    v_frexp_mant_f32_e64 v4, |s11|
+; CI-NEXT:    v_frexp_mant_f32_e64 v4, |s15|
 ; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 1
-; CI-NEXT:    v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
+; CI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, 1.0
 ; CI-NEXT:    v_frexp_mant_f32_e64 v3, |s7|
 ; CI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s7|
-; CI-NEXT:    v_frexp_exp_i32_f32_e64 v9, |s11|
+; CI-NEXT:    v_frexp_exp_i32_f32_e64 v9, |s15|
 ; CI-NEXT:    v_add_i32_e32 v5, vcc, -1, v8
 ; CI-NEXT:    v_ldexp_f32_e64 v7, v3, 12
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, -1, v9
@@ -3667,78 +3635,74 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_add_f32_e32 v4, v5, v4
 ; CI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; CI-NEXT:    v_ldexp_f32_e32 v3, v4, v3
-; CI-NEXT:    s_and_b32 s2, s7, 0x80000000
-; CI-NEXT:    v_xor_b32_e32 v3, s2, v3
+; CI-NEXT:    s_and_b32 s0, s7, 0x80000000
+; CI-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; CI-NEXT:  .LBB12_32: ; %Flow125
-; CI-NEXT:    v_mov_b32_e32 v4, 0x60
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s8, v4
-; CI-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, s12, 0
+; CI-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; CI-NEXT:    v_mov_b32_e32 v5, 0x3fc
 ; CI-NEXT:    v_mov_b32_e32 v6, 0x1f8
-; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s8, 3
-; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v6
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; CI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s9, v4
-; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s9, 3
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v6
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; CI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s10, v4
-; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s10, 3
-; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s6, v6
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; CI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s11, v4
-; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], s11, 3
-; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; CI-NEXT:    v_cmp_class_f32_e32 vcc, s7, v6
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s12, v5
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], s4, v6
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, s13, 0
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s13, v5
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], s5, v6
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, s14, 0
+; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s14, v5
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], s6, v6
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, s15, 0
+; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT:    v_cmp_class_f32_e32 vcc, s15, v5
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], s7, v6
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x40
-; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[10:11], 0x0
+; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x40
+; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v0, s12
 ; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
 ; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:    s_cbranch_vccz .LBB12_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_and_b32 s2, s4, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v1, s8
+; VI-NEXT:    s_and_b32 s0, s4, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v1, s12
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v1|
-; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB12_2: ; %Flow136
-; VI-NEXT:    s_xor_b32 s2, s2, 1
-; VI-NEXT:    s_and_b32 s2, s2, 1
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB12_8
 ; VI-NEXT:  ; %bb.3: ; %frem.compute
-; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s8|
+; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s12|
 ; VI-NEXT:    v_ldexp_f32 v1, v1, 1
-; VI-NEXT:    v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; VI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, 1.0
 ; VI-NEXT:    v_frexp_mant_f32_e64 v0, |s4|
 ; VI-NEXT:    v_frexp_exp_i32_f32_e64 v5, |s4|
-; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s8|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s12|
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v5
 ; VI-NEXT:    v_ldexp_f32 v4, v0, 12
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v6
@@ -3786,34 +3750,34 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_add_f32_e32 v1, v2, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
-; VI-NEXT:    s_and_b32 s2, s4, 0x80000000
-; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
+; VI-NEXT:    s_and_b32 s0, s4, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; VI-NEXT:  .LBB12_8: ; %Flow137
-; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v1, s13
 ; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s5|, |v1|
-; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    ; implicit-def: $vgpr1
 ; VI-NEXT:    s_cbranch_vccz .LBB12_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else16
-; VI-NEXT:    s_and_b32 s2, s5, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v2, s9
+; VI-NEXT:    s_and_b32 s0, s5, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v2, s13
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v2|
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB12_10: ; %Flow132
-; VI-NEXT:    s_xor_b32 s2, s2, 1
-; VI-NEXT:    s_and_b32 s2, s2, 1
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB12_16
 ; VI-NEXT:  ; %bb.11: ; %frem.compute15
-; VI-NEXT:    v_frexp_mant_f32_e64 v2, |s9|
+; VI-NEXT:    v_frexp_mant_f32_e64 v2, |s13|
 ; VI-NEXT:    v_ldexp_f32 v2, v2, 1
-; VI-NEXT:    v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
+; VI-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, 1.0
 ; VI-NEXT:    v_frexp_mant_f32_e64 v1, |s5|
 ; VI-NEXT:    v_frexp_exp_i32_f32_e64 v6, |s5|
-; VI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s9|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s13|
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v6
 ; VI-NEXT:    v_ldexp_f32 v5, v1, 12
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, -1, v7
@@ -3861,34 +3825,34 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_add_f32_e32 v2, v3, v2
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; VI-NEXT:    v_ldexp_f32 v1, v2, v1
-; VI-NEXT:    s_and_b32 s2, s5, 0x80000000
-; VI-NEXT:    v_xor_b32_e32 v1, s2, v1
+; VI-NEXT:    s_and_b32 s0, s5, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; VI-NEXT:  .LBB12_16: ; %Flow133
-; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v2, s14
 ; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s6|, |v2|
-; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    ; implicit-def: $vgpr2
 ; VI-NEXT:    s_cbranch_vccz .LBB12_18
 ; VI-NEXT:  ; %bb.17: ; %frem.else50
-; VI-NEXT:    s_and_b32 s2, s6, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v3, s10
+; VI-NEXT:    s_and_b32 s0, s6, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v3, s14
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v3|
-; VI-NEXT:    v_mov_b32_e32 v3, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB12_18: ; %Flow128
-; VI-NEXT:    s_xor_b32 s2, s2, 1
-; VI-NEXT:    s_and_b32 s2, s2, 1
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB12_24
 ; VI-NEXT:  ; %bb.19: ; %frem.compute49
-; VI-NEXT:    v_frexp_mant_f32_e64 v3, |s10|
+; VI-NEXT:    v_frexp_mant_f32_e64 v3, |s14|
 ; VI-NEXT:    v_ldexp_f32 v3, v3, 1
-; VI-NEXT:    v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
+; VI-NEXT:    v_div_scale_f32 v5, s[0:1], v3, v3, 1.0
 ; VI-NEXT:    v_frexp_mant_f32_e64 v2, |s6|
 ; VI-NEXT:    v_frexp_exp_i32_f32_e64 v7, |s6|
-; VI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s10|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s14|
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, -1, v7
 ; VI-NEXT:    v_ldexp_f32 v6, v2, 12
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v8
@@ -3936,34 +3900,34 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_add_f32_e32 v3, v4, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; VI-NEXT:    v_ldexp_f32 v2, v3, v2
-; VI-NEXT:    s_and_b32 s2, s6, 0x80000000
-; VI-NEXT:    v_xor_b32_e32 v2, s2, v2
+; VI-NEXT:    s_and_b32 s0, s6, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; VI-NEXT:  .LBB12_24: ; %Flow129
-; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_mov_b32_e32 v3, s15
 ; VI-NEXT:    v_cmp_ngt_f32_e64 vcc, |s7|, |v3|
-; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    s_mov_b32 s0, 1
 ; VI-NEXT:    ; implicit-def: $vgpr3
 ; VI-NEXT:    s_cbranch_vccz .LBB12_26
 ; VI-NEXT:  ; %bb.25: ; %frem.else84
-; VI-NEXT:    s_and_b32 s2, s7, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v4, s11
+; VI-NEXT:    s_and_b32 s0, s7, 0x80000000
+; VI-NEXT:    v_mov_b32_e32 v4, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v4|
-; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_mov_b32 s0, 0
 ; VI-NEXT:  .LBB12_26: ; %Flow124
-; VI-NEXT:    s_xor_b32 s2, s2, 1
-; VI-NEXT:    s_and_b32 s2, s2, 1
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_xor_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB12_32
 ; VI-NEXT:  ; %bb.27: ; %frem.compute83
-; VI-NEXT:    v_frexp_mant_f32_e64 v4, |s11|
+; VI-NEXT:    v_frexp_mant_f32_e64 v4, |s15|
 ; VI-NEXT:    v_ldexp_f32 v4, v4, 1
-; VI-NEXT:    v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
+; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, 1.0
 ; VI-NEXT:    v_frexp_mant_f32_e64 v3, |s7|
 ; VI-NEXT:    v_frexp_exp_i32_f32_e64 v8, |s7|
-; VI-NEXT:    v_frexp_exp_i32_f32_e64 v9, |s11|
+; VI-NEXT:    v_frexp_exp_i32_f32_e64 v9, |s15|
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, -1, v8
 ; VI-NEXT:    v_ldexp_f32 v7, v3, 12
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, -1, v9
@@ -4011,42 +3975,38 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_add_f32_e32 v4, v5, v4
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; VI-NEXT:    v_ldexp_f32 v3, v4, v3
-; VI-NEXT:    s_and_b32 s2, s7, 0x80000000
-; VI-NEXT:    v_xor_b32_e32 v3, s2, v3
+; VI-NEXT:    s_and_b32 s0, s7, 0x80000000
+; VI-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; VI-NEXT:  .LBB12_32: ; %Flow125
-; VI-NEXT:    v_mov_b32_e32 v4, 0x60
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s8, v4
-; VI-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, s12, 0
+; VI-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; VI-NEXT:    v_mov_b32_e32 v5, 0x3fc
 ; VI-NEXT:    v_mov_b32_e32 v6, 0x1f8
-; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s8, 3
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v6
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s9, v4
-; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s9, 3
-; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s5, v6
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s10, v4
-; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s10, 3
-; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s6, v6
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s11, v4
-; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], s11, 3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s7, v6
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s12, v5
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], s4, v6
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, s13, 0
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s13, v5
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], s5, v6
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, s14, 0
+; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s14, v5
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], s6, v6
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, s15, 0
+; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s15, v5
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], s7, v6
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s8
+; VI-NEXT:    v_mov_b32_e32 v5, s9
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
@@ -4060,48 +4020,48 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
 ; CI-LABEL: frem_v2f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; CI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
-; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[10:11], 0x0
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
+; CI-NEXT:    s_mov_b32 s10, 1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s8
-; CI-NEXT:    v_mov_b32_e32 v1, s9
-; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[0:1]|, |v[0:1]|
 ; CI-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CI-NEXT:    s_cbranch_vccz .LBB13_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
-; CI-NEXT:    v_mov_b32_e32 v0, s8
-; CI-NEXT:    v_mov_b32_e32 v1, s9
-; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
-; CI-NEXT:    s_mov_b32 s2, 0
-; CI-NEXT:    s_brev_b32 s3, 1
-; CI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[0:1]|, |v[0:1]|
+; CI-NEXT:    s_mov_b32 s10, 0
+; CI-NEXT:    s_brev_b32 s11, 1
+; CI-NEXT:    s_and_b64 s[10:11], s[0:1], s[10:11]
+; CI-NEXT:    v_mov_b32_e32 v0, s10
+; CI-NEXT:    v_mov_b32_e32 v1, s11
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v3, s1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_mov_b32 s10, 0
 ; CI-NEXT:  .LBB13_2: ; %Flow56
-; CI-NEXT:    s_xor_b32 s2, s2, 1
-; CI-NEXT:    s_and_b32 s2, s2, 1
-; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_xor_b32 s10, s10, 1
+; CI-NEXT:    s_and_b32 s10, s10, 1
+; CI-NEXT:    s_cmp_lg_u32 s10, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB13_8
 ; CI-NEXT:  ; %bb.3: ; %frem.compute
-; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
-; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
-; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
 ; CI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
-; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[8:9]|
+; CI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v6
 ; CI-NEXT:    v_add_i32_e32 v8, vcc, -1, v7
 ; CI-NEXT:    v_sub_i32_e32 v9, vcc, v2, v8
 ; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
-; CI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0
+; CI-NEXT:    v_div_scale_f64 v[2:3], s[10:11], v[0:1], v[0:1], 1.0
 ; CI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
 ; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
@@ -4139,9 +4099,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:  .LBB13_7: ; %frem.loop_exit
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffe7, v9
 ; CI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
-; CI-NEXT:    s_mov_b32 s2, 0
-; CI-NEXT:    s_brev_b32 s3, 1
-; CI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; CI-NEXT:    s_mov_b32 s10, 0
+; CI-NEXT:    s_brev_b32 s11, 1
+; CI-NEXT:    s_and_b64 s[10:11], s[0:1], s[10:11]
 ; CI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
 ; CI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
@@ -4150,45 +4110,45 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; CI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
-; CI-NEXT:    v_xor_b32_e32 v0, s2, v0
-; CI-NEXT:    v_xor_b32_e32 v1, s3, v1
+; CI-NEXT:    v_xor_b32_e32 v0, s10, v0
+; CI-NEXT:    v_xor_b32_e32 v1, s11, v1
 ; CI-NEXT:  .LBB13_8: ; %Flow57
-; CI-NEXT:    v_mov_b32_e32 v2, s10
-; CI-NEXT:    v_mov_b32_e32 v3, s11
-; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]|
-; CI-NEXT:    s_mov_b32 s2, 1
+; CI-NEXT:    v_mov_b32_e32 v2, s6
+; CI-NEXT:    v_mov_b32_e32 v3, s7
+; CI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[2:3]|
+; CI-NEXT:    s_mov_b32 s10, 1
 ; CI-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CI-NEXT:    s_cbranch_vccz .LBB13_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else16
+; CI-NEXT:    v_mov_b32_e32 v2, s6
+; CI-NEXT:    v_mov_b32_e32 v3, s7
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[2:3]|
+; CI-NEXT:    s_mov_b32 s10, 0
+; CI-NEXT:    s_brev_b32 s11, 1
+; CI-NEXT:    s_and_b64 s[10:11], s[2:3], s[10:11]
 ; CI-NEXT:    v_mov_b32_e32 v2, s10
 ; CI-NEXT:    v_mov_b32_e32 v3, s11
-; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
-; CI-NEXT:    s_mov_b32 s2, 0
-; CI-NEXT:    s_brev_b32 s3, 1
-; CI-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
-; CI-NEXT:    v_mov_b32_e32 v4, s6
-; CI-NEXT:    v_mov_b32_e32 v5, s7
+; CI-NEXT:    v_mov_b32_e32 v4, s2
+; CI-NEXT:    v_mov_b32_e32 v5, s3
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; CI-NEXT:    s_mov_b32 s2, 0
+; CI-NEXT:    s_mov_b32 s10, 0
 ; CI-NEXT:  .LBB13_10: ; %Flow52
-; CI-NEXT:    s_xor_b32 s2, s2, 1
-; CI-NEXT:    s_and_b32 s2, s2, 1
-; CI-NEXT:    s_cmp_lg_u32 s2, 0
+; CI-NEXT:    s_xor_b32 s10, s10, 1
+; CI-NEXT:    s_and_b32 s10, s10, 1
+; CI-NEXT:    s_cmp_lg_u32 s10, 0
 ; CI-NEXT:    s_cbranch_scc1 .LBB13_16
 ; CI-NEXT:  ; %bb.11: ; %frem.compute15
-; CI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
-; CI-NEXT:    v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
-; CI-NEXT:    v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
+; CI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[2:3]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v8, |s[2:3]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e64 v9, |s[6:7]|
 ; CI-NEXT:    v_ldexp_f64 v[6:7], v[2:3], 26
-; CI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[10:11]|
+; CI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, -1, v8
 ; CI-NEXT:    v_add_i32_e32 v10, vcc, -1, v9
 ; CI-NEXT:    v_sub_i32_e32 v11, vcc, v4, v10
 ; CI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 1
-; CI-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0
+; CI-NEXT:    v_div_scale_f64 v[4:5], s[10:11], v[2:3], v[2:3], 1.0
 ; CI-NEXT:    v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0
 ; CI-NEXT:    v_rcp_f64_e32 v[12:13], v[4:5]
 ; CI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
@@ -4226,9 +4186,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:  .LBB13_15: ; %frem.loop_exit24
 ; CI-NEXT:    v_add_i32_e32 v6, vcc, 0xffffffe7, v11
 ; CI-NEXT:    v_ldexp_f64 v[6:7], v[8:9], v6
-; CI-NEXT:    s_mov_b32 s2, 0
-; CI-NEXT:    s_brev_b32 s3, 1
-; CI-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
+; CI-NEXT:    s_mov_b32 s10, 0
+; CI-NEXT:    s_brev_b32 s11, 1
+; CI-NEXT:    s_and_b64 s[10:11], s[2:3], s[10:11]
 ; CI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[4:5]
 ; CI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
 ; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7]
@@ -4237,79 +4197,77 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; CI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
-; CI-NEXT:    v_xor_b32_e32 v2, s2, v2
-; CI-NEXT:    v_xor_b32_e32 v3, s3, v3
+; CI-NEXT:    v_xor_b32_e32 v2, s10, v2
+; CI-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; CI-NEXT:  .LBB13_16: ; %Flow53
-; CI-NEXT:    v_mov_b32_e32 v4, 0x60
-; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[8:9], v4
-; CI-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, s[4:5], 0
+; CI-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; CI-NEXT:    v_mov_b32_e32 v5, 0x3fc
 ; CI-NEXT:    v_mov_b32_e32 v6, 0x1f8
-; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[8:9], 3
+; CI-NEXT:    v_cmp_class_f64_e64 s[0:1], s[0:1], v6
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s11, 0xf000
 ; CI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v6
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v5
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; CI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[10:11], v4
-; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[10:11], 3
+; CI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, s[6:7], 0
+; CI-NEXT:    v_cmp_class_f64_e64 s[0:1], s[2:3], v6
 ; CI-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[6:7], v6
-; CI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; CI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT:    v_cmp_class_f64_e32 vcc, s[6:7], v5
+; CI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; CI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x40
-; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[10:11], 0x0
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x40
+; VI-NEXT:    s_mov_b32 s10, 1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[0:1]|, |v[0:1]|
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; VI-NEXT:    s_cbranch_vccz .LBB13_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
-; VI-NEXT:    s_mov_b32 s2, 0
-; VI-NEXT:    s_brev_b32 s3, 1
-; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[0:1]|, |v[0:1]|
+; VI-NEXT:    s_mov_b32 s10, 0
+; VI-NEXT:    s_brev_b32 s11, 1
+; VI-NEXT:    s_and_b64 s[10:11], s[0:1], s[10:11]
+; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    v_mov_b32_e32 v1, s11
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_mov_b32 s10, 0
 ; VI-NEXT:  .LBB13_2: ; %Flow56
-; VI-NEXT:    s_xor_b32 s2, s2, 1
-; VI-NEXT:    s_and_b32 s2, s2, 1
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_xor_b32 s10, s10, 1
+; VI-NEXT:    s_and_b32 s10, s10, 1
+; VI-NEXT:    s_cmp_lg_u32 s10, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB13_8
 ; VI-NEXT:  ; %bb.3: ; %frem.compute
-; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
-; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
-; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v6, |s[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
 ; VI-NEXT:    v_ldexp_f64 v[4:5], v[0:1], 26
-; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[8:9]|
+; VI-NEXT:    v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v6
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, -1, v7
 ; VI-NEXT:    v_sub_u32_e32 v9, vcc, v2, v8
 ; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 1
-; VI-NEXT:    v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0
+; VI-NEXT:    v_div_scale_f64 v[2:3], s[10:11], v[0:1], v[0:1], 1.0
 ; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
 ; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[2:3]
 ; VI-NEXT:    v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
@@ -4347,9 +4305,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:  .LBB13_7: ; %frem.loop_exit
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 0xffffffe7, v9
 ; VI-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v4
-; VI-NEXT:    s_mov_b32 s2, 0
-; VI-NEXT:    s_brev_b32 s3, 1
-; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT:    s_mov_b32 s10, 0
+; VI-NEXT:    s_brev_b32 s11, 1
+; VI-NEXT:    s_and_b64 s[10:11], s[0:1], s[10:11]
 ; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], v[2:3]
 ; VI-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
 ; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
@@ -4358,45 +4316,45 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v8
-; VI-NEXT:    v_xor_b32_e32 v0, s2, v0
-; VI-NEXT:    v_xor_b32_e32 v1, s3, v1
+; VI-NEXT:    v_xor_b32_e32 v0, s10, v0
+; VI-NEXT:    v_xor_b32_e32 v1, s11, v1
 ; VI-NEXT:  .LBB13_8: ; %Flow57
-; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]|
-; VI-NEXT:    s_mov_b32 s2, 1
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[2:3]|
+; VI-NEXT:    s_mov_b32 s10, 1
 ; VI-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; VI-NEXT:    s_cbranch_vccz .LBB13_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else16
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[2:3]|
+; VI-NEXT:    s_mov_b32 s10, 0
+; VI-NEXT:    s_brev_b32 s11, 1
+; VI-NEXT:    s_and_b64 s[10:11], s[2:3], s[10:11]
 ; VI-NEXT:    v_mov_b32_e32 v2, s10
 ; VI-NEXT:    v_mov_b32_e32 v3, s11
-; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
-; VI-NEXT:    s_mov_b32 s2, 0
-; VI-NEXT:    s_brev_b32 s3, 1
-; VI-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s6
-; VI-NEXT:    v_mov_b32_e32 v5, s7
+; VI-NEXT:    v_mov_b32_e32 v4, s2
+; VI-NEXT:    v_mov_b32_e32 v5, s3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; VI-NEXT:    s_mov_b32 s2, 0
+; VI-NEXT:    s_mov_b32 s10, 0
 ; VI-NEXT:  .LBB13_10: ; %Flow52
-; VI-NEXT:    s_xor_b32 s2, s2, 1
-; VI-NEXT:    s_and_b32 s2, s2, 1
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_xor_b32 s10, s10, 1
+; VI-NEXT:    s_and_b32 s10, s10, 1
+; VI-NEXT:    s_cmp_lg_u32 s10, 0
 ; VI-NEXT:    s_cbranch_scc1 .LBB13_16
 ; VI-NEXT:  ; %bb.11: ; %frem.compute15
-; VI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
-; VI-NEXT:    v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
-; VI-NEXT:    v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
+; VI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[2:3]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v8, |s[2:3]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e64 v9, |s[6:7]|
 ; VI-NEXT:    v_ldexp_f64 v[6:7], v[2:3], 26
-; VI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[10:11]|
+; VI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, -1, v8
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, -1, v9
 ; VI-NEXT:    v_sub_u32_e32 v11, vcc, v4, v10
 ; VI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 1
-; VI-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0
+; VI-NEXT:    v_div_scale_f64 v[4:5], s[10:11], v[2:3], v[2:3], 1.0
 ; VI-NEXT:    v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0
 ; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[4:5]
 ; VI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
@@ -4434,9 +4392,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:  .LBB13_15: ; %frem.loop_exit24
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 0xffffffe7, v11
 ; VI-NEXT:    v_ldexp_f64 v[6:7], v[8:9], v6
-; VI-NEXT:    s_mov_b32 s2, 0
-; VI-NEXT:    s_brev_b32 s3, 1
-; VI-NEXT:    s_and_b64 s[2:3], s[6:7], s[2:3]
+; VI-NEXT:    s_mov_b32 s10, 0
+; VI-NEXT:    s_brev_b32 s11, 1
+; VI-NEXT:    s_and_b64 s[10:11], s[2:3], s[10:11]
 ; VI-NEXT:    v_mul_f64 v[4:5], v[6:7], v[4:5]
 ; VI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
 ; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7]
@@ -4445,32 +4403,30 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v10
-; VI-NEXT:    v_xor_b32_e32 v2, s2, v2
-; VI-NEXT:    v_xor_b32_e32 v3, s3, v3
+; VI-NEXT:    v_xor_b32_e32 v2, s10, v2
+; VI-NEXT:    v_xor_b32_e32 v3, s11, v3
 ; VI-NEXT:  .LBB13_16: ; %Flow53
-; VI-NEXT:    v_mov_b32_e32 v4, 0x60
-; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[8:9], v4
-; VI-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, s[4:5], 0
+; VI-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; VI-NEXT:    v_mov_b32_e32 v5, 0x3fc
 ; VI-NEXT:    v_mov_b32_e32 v6, 0x1f8
-; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[8:9], 3
+; VI-NEXT:    v_cmp_class_f64_e64 s[0:1], s[0:1], v6
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v6
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[4:5], v5
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[10:11], v4
-; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], s[10:11], 3
+; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, s[6:7], 0
+; VI-NEXT:    v_cmp_class_f64_e64 s[0:1], s[2:3], v6
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[6:7], v6
-; VI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
-; VI-NEXT:    s_and_b64 vcc, s[2:3], vcc
-; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT:    v_cmp_class_f64_e32 vcc, s[6:7], v5
+; VI-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s8
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s9
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
    %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 0d1ed8068b9b5..82effe2914c35 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -2579,6 +2579,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1032-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
 ; GFX1032-NEXT:    v_cmp_eq_f32_e64 vcc_lo, v0, |s0|
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:  ; %bb.2: ; %Flow13
 ; GFX1032-NEXT:    s_andn2_saveexec_b32 s1, s1
 ; GFX1032-NEXT:    s_cbranch_execz .LBB51_8
@@ -2630,6 +2631,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1032-NEXT:  .LBB51_7: ; %Flow12
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    v_add_nc_u32_e32 v4, -11, v4
+; GFX1032-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; GFX1032-NEXT:    v_ldexp_f32 v4, v5, v4
 ; GFX1032-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX1032-NEXT:    v_rndne_f32_e32 v3, v3
@@ -2638,18 +2640,14 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1032-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX1032-NEXT:    v_ldexp_f32 v1, v1, v2
-; GFX1032-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
-; GFX1032-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1032-NEXT:    v_xor_b32_e32 v1, v0, v1
 ; GFX1032-NEXT:  .LBB51_8: ; %Flow14
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1032-NEXT:    v_cmp_class_f32_e64 s1, s0, 3
-; GFX1032-NEXT:    v_cmp_class_f32_e64 s0, s0, 0x60
-; GFX1032-NEXT:    v_cmp_class_f32_e64 s2, v0, 0x1f8
-; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7fc00000, s0
-; GFX1032-NEXT:    s_xor_b32 s0, s1, -1
+; GFX1032-NEXT:    v_cmp_neq_f32_e64 s1, s0, 0
+; GFX1032-NEXT:    v_cmp_class_f32_e64 s0, s0, 0x3fc
+; GFX1032-NEXT:    s_and_b32 vcc_lo, s0, s1
 ; GFX1032-NEXT:    s_brev_b32 s1, 1
-; GFX1032-NEXT:    s_and_b32 vcc_lo, s0, s2
-; GFX1032-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX1032-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo
 ; GFX1032-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_lshr_b32 s0, vcc_lo, 1
 ; GFX1032-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, 0, v0
@@ -2676,6 +2674,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1064-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
 ; GFX1064-NEXT:    v_cmp_eq_f32_e64 vcc, v0, |s6|
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:  ; %bb.2: ; %Flow13
 ; GFX1064-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB51_8
@@ -2727,6 +2726,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1064-NEXT:  .LBB51_7: ; %Flow12
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_add_nc_u32_e32 v4, -11, v4
+; GFX1064-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; GFX1064-NEXT:    v_ldexp_f32 v4, v5, v4
 ; GFX1064-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX1064-NEXT:    v_rndne_f32_e32 v3, v3
@@ -2735,17 +2735,13 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1064-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX1064-NEXT:    v_ldexp_f32 v1, v1, v2
-; GFX1064-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
-; GFX1064-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1064-NEXT:    v_xor_b32_e32 v1, v0, v1
 ; GFX1064-NEXT:  .LBB51_8: ; %Flow14
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    v_cmp_class_f32_e64 s[4:5], s6, 0x60
-; GFX1064-NEXT:    v_cmp_class_f32_e64 s[0:1], s6, 3
-; GFX1064-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, 0x1f8
-; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7fc00000, s[4:5]
-; GFX1064-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
-; GFX1064-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
-; GFX1064-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc
+; GFX1064-NEXT:    v_cmp_neq_f32_e64 s[0:1], s6, 0
+; GFX1064-NEXT:    v_cmp_class_f32_e64 s[2:3], s6, 0x3fc
+; GFX1064-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
+; GFX1064-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc
 ; GFX1064-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_lshr_b64 s[0:1], vcc, 1
 ; GFX1064-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v0
@@ -2895,6 +2891,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1032-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
 ; GFX1032-NEXT:    v_cmp_eq_f32_e64 vcc_lo, v0, |s0|
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:  ; %bb.2: ; %Flow13
 ; GFX1032-NEXT:    s_andn2_saveexec_b32 s1, s1
 ; GFX1032-NEXT:    s_cbranch_execz .LBB53_8
@@ -2946,6 +2943,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1032-NEXT:  .LBB53_7: ; %Flow12
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032-NEXT:    v_add_nc_u32_e32 v4, -11, v4
+; GFX1032-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; GFX1032-NEXT:    v_ldexp_f32 v4, v5, v4
 ; GFX1032-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX1032-NEXT:    v_rndne_f32_e32 v3, v3
@@ -2954,17 +2952,13 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1032-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX1032-NEXT:    v_ldexp_f32 v1, v1, v2
-; GFX1032-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
-; GFX1032-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1032-NEXT:    v_xor_b32_e32 v1, v0, v1
 ; GFX1032-NEXT:  .LBB53_8: ; %Flow14
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1032-NEXT:    v_cmp_class_f32_e64 s1, s0, 3
-; GFX1032-NEXT:    v_cmp_class_f32_e64 s0, s0, 0x60
-; GFX1032-NEXT:    v_cmp_class_f32_e64 s2, v0, 0x1f8
-; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7fc00000, s0
-; GFX1032-NEXT:    s_xor_b32 s0, s1, -1
-; GFX1032-NEXT:    s_and_b32 vcc_lo, s0, s2
-; GFX1032-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX1032-NEXT:    v_cmp_neq_f32_e64 s1, s0, 0
+; GFX1032-NEXT:    v_cmp_class_f32_e64 s0, s0, 0x3fc
+; GFX1032-NEXT:    s_and_b32 vcc_lo, s0, s1
+; GFX1032-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo
 ; GFX1032-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_lshr_b32 s0, vcc_lo, 1
 ; GFX1032-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, 0, v0
@@ -2992,6 +2986,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1064-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
 ; GFX1064-NEXT:    v_cmp_eq_f32_e64 vcc, v0, |s6|
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:  ; %bb.2: ; %Flow13
 ; GFX1064-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064-NEXT:    s_cbranch_execz .LBB53_8
@@ -3043,6 +3038,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1064-NEXT:  .LBB53_7: ; %Flow12
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    v_add_nc_u32_e32 v4, -11, v4
+; GFX1064-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; GFX1064-NEXT:    v_ldexp_f32 v4, v5, v4
 ; GFX1064-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX1064-NEXT:    v_rndne_f32_e32 v3, v3
@@ -3051,17 +3047,13 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1064-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX1064-NEXT:    v_ldexp_f32 v1, v1, v2
-; GFX1064-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
-; GFX1064-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1064-NEXT:    v_xor_b32_e32 v1, v0, v1
 ; GFX1064-NEXT:  .LBB53_8: ; %Flow14
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    v_cmp_class_f32_e64 s[4:5], s6, 0x60
-; GFX1064-NEXT:    v_cmp_class_f32_e64 s[0:1], s6, 3
-; GFX1064-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, 0x1f8
-; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7fc00000, s[4:5]
-; GFX1064-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
-; GFX1064-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
-; GFX1064-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc
+; GFX1064-NEXT:    v_cmp_neq_f32_e64 s[0:1], s6, 0
+; GFX1064-NEXT:    v_cmp_class_f32_e64 s[2:3], s6, 0x3fc
+; GFX1064-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
+; GFX1064-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc
 ; GFX1064-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_lshr_b32 s0, vcc_lo, 1
 ; GFX1064-NEXT:    v_cmp_nlg_f32_e32 vcc, 0, v0

>From f38f769349d14637830b083db8387b318d0e922a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 18 Mar 2025 23:13:12 +0700
Subject: [PATCH 10/19] AMDGPU: Add more freeze codegen tests

---
 llvm/test/CodeGen/AMDGPU/freeze.ll | 1963 ++++++++++++++++++++++++++++
 1 file changed, 1963 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 22427ee344d91..42d6e57585345 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -1854,3 +1854,1966 @@ define void @freeze_i256(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
   store i256 %freeze, ptr addrspace(1) %ptrb, align 4
   ret void
 }
+
+define void @freeze_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load i16, ptr addrspace(1) %ptra
+  %freeze = freeze i16 %a
+  store i16 %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v2i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v2i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <2 x i16>, ptr addrspace(1) %ptra
+  %freeze = freeze <2 x i16> %a
+  store <2 x i16> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v3i16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX10-SDAG-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v3i16:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-GISEL-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX10-GISEL-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v3i16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v1, off offset:4
+; GFX11-SDAG-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v3i16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x2
+; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off offset:2
+; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v1, off offset:4
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <3 x i16>, ptr addrspace(1) %ptra
+  %freeze = freeze <3 x i16> %a
+  store <3 x i16> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v4i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v4i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v4i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <4 x i16>, ptr addrspace(1) %ptra
+  %freeze = freeze <4 x i16> %a
+  store <4 x i16> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v8i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v8i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v8i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <8 x i16>, ptr addrspace(1) %ptra
+  %freeze = freeze <8 x i16> %a
+  store <8 x i16> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v16i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v16i16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x1
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v16i16:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x1
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v16i16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v16i16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <16 x i16>, ptr addrspace(1) %ptra
+  %freeze = freeze <16 x i16> %a
+  store <16 x i16> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load half, ptr addrspace(1) %ptra
+  %freeze = freeze half %a
+  store half %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v2f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <2 x half>, ptr addrspace(1) %ptra
+  %freeze = freeze <2 x half> %a
+  store <2 x half> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v3f16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX10-SDAG-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v3f16:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-GISEL-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX10-GISEL-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v3f16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v1, off offset:4
+; GFX11-SDAG-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v3f16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x2
+; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off offset:2
+; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v1, off offset:4
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <3 x half>, ptr addrspace(1) %ptra
+  %freeze = freeze <3 x half> %a
+  store <3 x half> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v4f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <4 x half>, ptr addrspace(1) %ptra
+  %freeze = freeze <4 x half> %a
+  store <4 x half> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v8f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v8f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v8f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <8 x half>, ptr addrspace(1) %ptra
+  %freeze = freeze <8 x half> %a
+  store <8 x half> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v16f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v16f16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x1
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v16f16:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x1
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v16f16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v16f16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <16 x half>, ptr addrspace(1) %ptra
+  %freeze = freeze <16 x half> %a
+  store <16 x half> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load bfloat, ptr addrspace(1) %ptra
+  %freeze = freeze bfloat %a
+  store bfloat %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v2bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <2 x bfloat>, ptr addrspace(1) %ptra
+  %freeze = freeze <2 x bfloat> %a
+  store <2 x bfloat> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v3bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v3bf16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX10-SDAG-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v3bf16:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-GISEL-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX10-GISEL-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v3bf16:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v1, off offset:4
+; GFX11-SDAG-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v3bf16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x2
+; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v[2:3], v0, off offset:2
+; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v1, off offset:4
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <3 x bfloat>, ptr addrspace(1) %ptra
+  %freeze = freeze <3 x bfloat> %a
+  store <3 x bfloat> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v4bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <4 x bfloat>, ptr addrspace(1) %ptra
+  %freeze = freeze <4 x bfloat> %a
+  store <4 x bfloat> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v8bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v8bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <8 x bfloat>, ptr addrspace(1) %ptra
+  %freeze = freeze <8 x bfloat> %a
+  store <8 x bfloat> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load double, ptr addrspace(1) %ptra
+  %freeze = freeze double %a
+  store double %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v2f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v2f64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v2f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <2 x double>, ptr addrspace(1) %ptra
+  %freeze = freeze <2 x double> %a
+  store <2 x double> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v3f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v3f64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x1
+; GFX10-SDAG-NEXT:    global_load_dwordx2 v[8:9], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v[2:3], v[8:9], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v3f64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x1
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[8:9], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v3f64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_load_b64 v[8:9], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b64 v[2:3], v[8:9], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v3f64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b64 v[2:3], v[8:9], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <3 x double>, ptr addrspace(1) %ptra
+  %freeze = freeze <3 x double> %a
+  store <3 x double> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v4f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v4f64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x1
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v4f64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x1
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v4f64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v4f64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <4 x double>, ptr addrspace(1) %ptra
+  %freeze = freeze <4 x double> %a
+  store <4 x double> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v8f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v8f64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x3
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:32
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:48
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:32
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:48
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:16
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v8f64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x3
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v8f64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x3
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:32
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:48
+; GFX11-SDAG-NEXT:    global_load_b128 v[12:15], v[0:1], off
+; GFX11-SDAG-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:32
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:48
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[12:15], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v8f64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x3
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:32
+; GFX11-GISEL-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:48
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[12:15], off offset:32
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:48
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <8 x double>, ptr addrspace(1) %ptra
+  %freeze = freeze <8 x double> %a
+  store <8 x double> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_p0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_p0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load ptr, ptr addrspace(1) %ptra
+  %freeze = freeze ptr %a
+  store ptr %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v2p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v2p0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v2p0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <2 x ptr>, ptr addrspace(1) %ptra
+  %freeze = freeze <2 x ptr> %a
+  store <2 x ptr> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v3p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v3p0:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x1
+; GFX10-SDAG-NEXT:    global_load_dwordx2 v[8:9], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v[2:3], v[8:9], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v3p0:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x1
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
+; GFX10-GISEL-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[6:9], off
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v3p0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_load_b64 v[8:9], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b64 v[2:3], v[8:9], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v3p0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    global_load_b128 v[6:9], v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[6:9], off
+; GFX11-GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <3 x ptr>, ptr addrspace(1) %ptra
+  %freeze = freeze <3 x ptr> %a
+  store <3 x ptr> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v4p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v4p0:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x1
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v4p0:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x1
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v4p0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v4p0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <4 x ptr>, ptr addrspace(1) %ptra
+  %freeze = freeze <4 x ptr> %a
+  store <4 x ptr> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v8p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v8p0:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x3
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:32
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:48
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:32
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:48
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:16
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v8p0:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x3
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v8p0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x3
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:32
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:48
+; GFX11-SDAG-NEXT:    global_load_b128 v[12:15], v[0:1], off
+; GFX11-SDAG-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:32
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:48
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[12:15], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v8p0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x3
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:32
+; GFX11-GISEL-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:48
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[12:15], off offset:32
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:48
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <8 x ptr>, ptr addrspace(1) %ptra
+  %freeze = freeze <8 x ptr> %a
+  store <8 x ptr> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v16p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v16p0:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x7
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:96
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:112
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:64
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:80
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:48
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:96
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:112
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:64
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:80
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:32
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[24:27], off offset:48
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[32:35], off offset:16
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v16p0:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x7
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:80
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:96
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:112
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[24:27], off offset:80
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off offset:96
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[32:35], off offset:112
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v16p0:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x7
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:96
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:112
+; GFX11-SDAG-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:64
+; GFX11-SDAG-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:80
+; GFX11-SDAG-NEXT:    global_load_b128 v[20:23], v[0:1], off offset:32
+; GFX11-SDAG-NEXT:    global_load_b128 v[24:27], v[0:1], off offset:48
+; GFX11-SDAG-NEXT:    global_load_b128 v[28:31], v[0:1], off
+; GFX11-SDAG-NEXT:    global_load_b128 v[32:35], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:96
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:112
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[12:15], off offset:64
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:80
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[20:23], off offset:32
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[24:27], off offset:48
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[28:31], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[32:35], off offset:16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v16p0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x7
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:32
+; GFX11-GISEL-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:48
+; GFX11-GISEL-NEXT:    global_load_b128 v[20:23], v[0:1], off offset:64
+; GFX11-GISEL-NEXT:    global_load_b128 v[24:27], v[0:1], off offset:80
+; GFX11-GISEL-NEXT:    global_load_b128 v[28:31], v[0:1], off offset:96
+; GFX11-GISEL-NEXT:    global_load_b128 v[32:35], v[0:1], off offset:112
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[12:15], off offset:32
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:48
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[20:23], off offset:64
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[24:27], off offset:80
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[28:31], off offset:96
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[32:35], off offset:112
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <16 x ptr>, ptr addrspace(1) %ptra
+  %freeze = freeze <16 x ptr> %a
+  store <16 x ptr> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_p1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_p1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load ptr addrspace(1), ptr addrspace(1) %ptra
+  %freeze = freeze ptr addrspace(1) %a
+  store ptr addrspace(1) %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v2p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-LABEL: freeze_v2p1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v2p1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <2 x ptr addrspace(1)>, ptr addrspace(1) %ptra
+  %freeze = freeze <2 x ptr addrspace(1)> %a
+  store <2 x ptr addrspace(1)> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v3p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v3p1:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x1
+; GFX10-SDAG-NEXT:    global_load_dwordx2 v[8:9], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx2 v[2:3], v[8:9], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v3p1:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x1
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
+; GFX10-GISEL-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[6:9], off
+; GFX10-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v3p1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_load_b64 v[8:9], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b64 v[2:3], v[8:9], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v3p1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    global_load_b128 v[6:9], v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[6:9], off
+; GFX11-GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <3 x ptr addrspace(1)>, ptr addrspace(1) %ptra
+  %freeze = freeze <3 x ptr addrspace(1)> %a
+  store <3 x ptr addrspace(1)> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v4p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v4p1:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x1
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v4p1:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x1
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v4p1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v4p1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <4 x ptr addrspace(1)>, ptr addrspace(1) %ptra
+  %freeze = freeze <4 x ptr addrspace(1)> %a
+  store <4 x ptr addrspace(1)> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v8p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v8p1:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x3
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:32
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:48
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:32
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:48
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:16
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v8p1:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x3
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v8p1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x3
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:32
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:48
+; GFX11-SDAG-NEXT:    global_load_b128 v[12:15], v[0:1], off
+; GFX11-SDAG-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:32
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:48
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[12:15], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v8p1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x3
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:32
+; GFX11-GISEL-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:48
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[12:15], off offset:32
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:48
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <8 x ptr addrspace(1)>, ptr addrspace(1) %ptra
+  %freeze = freeze <8 x ptr addrspace(1)> %a
+  store <8 x ptr addrspace(1)> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_v16p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v16p1:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_clause 0x7
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:96
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:112
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:64
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:80
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:32
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:48
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off
+; GFX10-SDAG-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:96
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:112
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:64
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:80
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:32
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[24:27], off offset:48
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[32:35], off offset:16
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v16p1:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_clause 0x7
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:80
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:96
+; GFX10-GISEL-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:112
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[24:27], off offset:80
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off offset:96
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[32:35], off offset:112
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v16p1:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x7
+; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:96
+; GFX11-SDAG-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:112
+; GFX11-SDAG-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:64
+; GFX11-SDAG-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:80
+; GFX11-SDAG-NEXT:    global_load_b128 v[20:23], v[0:1], off offset:32
+; GFX11-SDAG-NEXT:    global_load_b128 v[24:27], v[0:1], off offset:48
+; GFX11-SDAG-NEXT:    global_load_b128 v[28:31], v[0:1], off
+; GFX11-SDAG-NEXT:    global_load_b128 v[32:35], v[0:1], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[4:7], off offset:96
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:112
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[12:15], off offset:64
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:80
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[20:23], off offset:32
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[24:27], off offset:48
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[28:31], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    global_store_b128 v[2:3], v[32:35], off offset:16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v16p1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_clause 0x7
+; GFX11-GISEL-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-GISEL-NEXT:    global_load_b128 v[8:11], v[0:1], off offset:16
+; GFX11-GISEL-NEXT:    global_load_b128 v[12:15], v[0:1], off offset:32
+; GFX11-GISEL-NEXT:    global_load_b128 v[16:19], v[0:1], off offset:48
+; GFX11-GISEL-NEXT:    global_load_b128 v[20:23], v[0:1], off offset:64
+; GFX11-GISEL-NEXT:    global_load_b128 v[24:27], v[0:1], off offset:80
+; GFX11-GISEL-NEXT:    global_load_b128 v[28:31], v[0:1], off offset:96
+; GFX11-GISEL-NEXT:    global_load_b128 v[32:35], v[0:1], off offset:112
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[12:15], off offset:32
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[16:19], off offset:48
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[20:23], off offset:64
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[24:27], off offset:80
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[28:31], off offset:96
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b128 v[2:3], v[32:35], off offset:112
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <16 x ptr addrspace(1)>, ptr addrspace(1) %ptra
+  %freeze = freeze <16 x ptr addrspace(1)> %a
+  store <16 x ptr addrspace(1)> %freeze, ptr addrspace(1) %ptrb
+  ret void
+}
+
+define void @freeze_p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX10-LABEL: freeze_p3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    ds_read_b32 v0, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    ds_write_b32 v1, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_p3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    ds_store_b32 v1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load ptr addrspace(3), ptr addrspace(3) %ptra
+  %freeze = freeze ptr addrspace(3) %a
+  store ptr addrspace(3) %freeze, ptr addrspace(3) %ptrb
+  ret void
+}
+
+define void @freeze_v2p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX10-LABEL: freeze_v2p3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    ds_read_b64 v[2:3], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v2p3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b64 v[2:3], v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    ds_store_b64 v1, v[2:3]
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <2 x ptr addrspace(3)>, ptr addrspace(3) %ptra
+  %freeze = freeze <2 x ptr addrspace(3)> %a
+  store <2 x ptr addrspace(3)> %freeze, ptr addrspace(3) %ptrb
+  ret void
+}
+
+define void @freeze_v3p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX10-LABEL: freeze_v3p3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    ds_read_b96 v[2:4], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    ds_write_b96 v1, v[2:4]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v3p3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b96 v[2:4], v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    ds_store_b96 v1, v[2:4]
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <3 x ptr addrspace(3)>, ptr addrspace(3) %ptra
+  %freeze = freeze <3 x ptr addrspace(3)> %a
+  store <3 x ptr addrspace(3)> %freeze, ptr addrspace(3) %ptrb
+  ret void
+}
+
+define void @freeze_v4p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX10-LABEL: freeze_v4p3:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    ds_read_b128 v[2:5], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v4p3:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b128 v[2:5], v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    ds_store_b128 v1, v[2:5]
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <4 x ptr addrspace(3)>, ptr addrspace(3) %ptra
+  %freeze = freeze <4 x ptr addrspace(3)> %a
+  store <4 x ptr addrspace(3)> %freeze, ptr addrspace(3) %ptrb
+  ret void
+}
+
+define void @freeze_v8p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v8p3:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    ds_read_b128 v[2:5], v0 offset:16
+; GFX10-SDAG-NEXT:    ds_read_b128 v[6:9], v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX10-SDAG-NEXT:    ds_write_b128 v1, v[2:5] offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX10-SDAG-NEXT:    ds_write_b128 v1, v[6:9]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v8p3:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    ds_read_b128 v[2:5], v0
+; GFX10-GISEL-NEXT:    ds_read_b128 v[6:9], v0 offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX10-GISEL-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX10-GISEL-NEXT:    ds_write_b128 v1, v[6:9] offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v8p3:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    ds_load_b128 v[2:5], v0 offset:16
+; GFX11-SDAG-NEXT:    ds_load_b128 v[6:9], v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-SDAG-NEXT:    ds_store_b128 v1, v[2:5] offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-SDAG-NEXT:    ds_store_b128 v1, v[6:9]
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v8p3:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    ds_load_b128 v[2:5], v0
+; GFX11-GISEL-NEXT:    ds_load_b128 v[6:9], v0 offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-GISEL-NEXT:    ds_store_b128 v1, v[2:5]
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX11-GISEL-NEXT:    ds_store_b128 v1, v[6:9] offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <8 x ptr addrspace(3)>, ptr addrspace(3) %ptra
+  %freeze = freeze <8 x ptr addrspace(3)> %a
+  store <8 x ptr addrspace(3)> %freeze, ptr addrspace(3) %ptrb
+  ret void
+}
+
+define void @freeze_v16p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX10-SDAG-LABEL: freeze_v16p3:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    ds_read_b128 v[2:5], v0 offset:32
+; GFX10-SDAG-NEXT:    ds_read_b128 v[6:9], v0 offset:48
+; GFX10-SDAG-NEXT:    ds_read_b128 v[10:13], v0
+; GFX10-SDAG-NEXT:    ds_read_b128 v[14:17], v0 offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX10-SDAG-NEXT:    ds_write_b128 v1, v[2:5] offset:32
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX10-SDAG-NEXT:    ds_write_b128 v1, v[6:9] offset:48
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX10-SDAG-NEXT:    ds_write_b128 v1, v[10:13]
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX10-SDAG-NEXT:    ds_write_b128 v1, v[14:17] offset:16
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: freeze_v16p3:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    ds_read_b128 v[2:5], v0
+; GFX10-GISEL-NEXT:    ds_read_b128 v[6:9], v0 offset:16
+; GFX10-GISEL-NEXT:    ds_read_b128 v[10:13], v0 offset:32
+; GFX10-GISEL-NEXT:    ds_read_b128 v[14:17], v0 offset:48
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX10-GISEL-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX10-GISEL-NEXT:    ds_write_b128 v1, v[6:9] offset:16
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX10-GISEL-NEXT:    ds_write_b128 v1, v[10:13] offset:32
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX10-GISEL-NEXT:    ds_write_b128 v1, v[14:17] offset:48
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v16p3:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    ds_load_b128 v[2:5], v0 offset:32
+; GFX11-SDAG-NEXT:    ds_load_b128 v[6:9], v0 offset:48
+; GFX11-SDAG-NEXT:    ds_load_b128 v[10:13], v0
+; GFX11-SDAG-NEXT:    ds_load_b128 v[14:17], v0 offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX11-SDAG-NEXT:    ds_store_b128 v1, v[2:5] offset:32
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX11-SDAG-NEXT:    ds_store_b128 v1, v[6:9] offset:48
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX11-SDAG-NEXT:    ds_store_b128 v1, v[10:13]
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX11-SDAG-NEXT:    ds_store_b128 v1, v[14:17] offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v16p3:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    ds_load_b128 v[2:5], v0
+; GFX11-GISEL-NEXT:    ds_load_b128 v[6:9], v0 offset:16
+; GFX11-GISEL-NEXT:    ds_load_b128 v[10:13], v0 offset:32
+; GFX11-GISEL-NEXT:    ds_load_b128 v[14:17], v0 offset:48
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX11-GISEL-NEXT:    ds_store_b128 v1, v[2:5]
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX11-GISEL-NEXT:    ds_store_b128 v1, v[6:9] offset:16
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX11-GISEL-NEXT:    ds_store_b128 v1, v[10:13] offset:32
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX11-GISEL-NEXT:    ds_store_b128 v1, v[14:17] offset:48
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <16 x ptr addrspace(3)>, ptr addrspace(3) %ptra
+  %freeze = freeze <16 x ptr addrspace(3)> %a
+  store <16 x ptr addrspace(3)> %freeze, ptr addrspace(3) %ptrb
+  ret void
+}
+
+define void @freeze_p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX10-LABEL: freeze_p5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_p5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b32 v1, v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load ptr addrspace(5), ptr addrspace(5) %ptra
+  %freeze = freeze ptr addrspace(5) %a
+  store ptr addrspace(5) %freeze, ptr addrspace(5) %ptrb
+  ret void
+}
+
+define void @freeze_v2p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX10-LABEL: freeze_v2p5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v2p5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b64 v[2:3], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b64 v1, v[2:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <2 x ptr addrspace(5)>, ptr addrspace(5) %ptra
+  %freeze = freeze <2 x ptr addrspace(5)> %a
+  store <2 x ptr addrspace(5)> %freeze, ptr addrspace(5) %ptrb
+  ret void
+}
+
+define void @freeze_v3p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX10-LABEL: freeze_v3p5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v3p5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b96 v[2:4], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b96 v1, v[2:4], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <3 x ptr addrspace(5)>, ptr addrspace(5) %ptra
+  %freeze = freeze <3 x ptr addrspace(5)> %a
+  store <3 x ptr addrspace(5)> %freeze, ptr addrspace(5) %ptrb
+  ret void
+}
+
+define void @freeze_v4p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX10-LABEL: freeze_v4p5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: freeze_v4p5:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b128 v[2:5], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 v1, v[2:5], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <4 x ptr addrspace(5)>, ptr addrspace(5) %ptra
+  %freeze = freeze <4 x ptr addrspace(5)> %a
+  store <4 x ptr addrspace(5)> %freeze, ptr addrspace(5) %ptrb
+  ret void
+}
+
+define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX10-LABEL: freeze_v8p5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    buffer_load_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    buffer_load_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v8p5:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    scratch_load_b128 v[2:5], v0, off offset:16
+; GFX11-SDAG-NEXT:    scratch_load_b128 v[6:9], v0, off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    scratch_store_b128 v1, v[2:5], off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    scratch_store_b128 v1, v[6:9], off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v8p5:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v6, 16, v0
+; GFX11-GISEL-NEXT:    s_clause 0x1
+; GFX11-GISEL-NEXT:    scratch_load_b128 v[2:5], v0, off
+; GFX11-GISEL-NEXT:    scratch_load_b128 v[6:9], v6, off
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 16, v1
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    scratch_store_b128 v1, v[2:5], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    scratch_store_b128 v0, v[6:9], off
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <8 x ptr addrspace(5)>, ptr addrspace(5) %ptra
+  %freeze = freeze <8 x ptr addrspace(5)> %a
+  store <8 x ptr addrspace(5)> %freeze, ptr addrspace(5) %ptrb
+  ret void
+}
+
+define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX10-LABEL: freeze_v16p5:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0xf
+; GFX10-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX10-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    buffer_load_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    buffer_load_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    buffer_load_dword v10, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT:    buffer_load_dword v11, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT:    buffer_load_dword v12, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT:    buffer_load_dword v13, v0, s[0:3], 0 offen offset:44
+; GFX10-NEXT:    buffer_load_dword v14, v0, s[0:3], 0 offen offset:48
+; GFX10-NEXT:    buffer_load_dword v15, v0, s[0:3], 0 offen offset:52
+; GFX10-NEXT:    buffer_load_dword v16, v0, s[0:3], 0 offen offset:56
+; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(15)
+; GFX10-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(14)
+; GFX10-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(13)
+; GFX10-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(12)
+; GFX10-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    s_waitcnt vmcnt(11)
+; GFX10-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(10)
+; GFX10-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    s_waitcnt vmcnt(9)
+; GFX10-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen offset:32
+; GFX10-NEXT:    s_waitcnt vmcnt(6)
+; GFX10-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen offset:36
+; GFX10-NEXT:    s_waitcnt vmcnt(5)
+; GFX10-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen offset:40
+; GFX10-NEXT:    s_waitcnt vmcnt(4)
+; GFX10-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen offset:44
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen offset:48
+; GFX10-NEXT:    s_waitcnt vmcnt(2)
+; GFX10-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen offset:52
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen offset:56
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:60
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: freeze_v16p5:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x3
+; GFX11-SDAG-NEXT:    scratch_load_b128 v[2:5], v0, off offset:32
+; GFX11-SDAG-NEXT:    scratch_load_b128 v[6:9], v0, off offset:48
+; GFX11-SDAG-NEXT:    scratch_load_b128 v[10:13], v0, off
+; GFX11-SDAG-NEXT:    scratch_load_b128 v[14:17], v0, off offset:16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-SDAG-NEXT:    scratch_store_b128 v1, v[2:5], off offset:32
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-SDAG-NEXT:    scratch_store_b128 v1, v[6:9], off offset:48
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-SDAG-NEXT:    scratch_store_b128 v1, v[10:13], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    scratch_store_b128 v1, v[14:17], off offset:16
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_v16p5:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v6, 16, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v10, 32, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v14, 48, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v18, 32, v1
+; GFX11-GISEL-NEXT:    s_clause 0x3
+; GFX11-GISEL-NEXT:    scratch_load_b128 v[2:5], v0, off
+; GFX11-GISEL-NEXT:    scratch_load_b128 v[6:9], v6, off
+; GFX11-GISEL-NEXT:    scratch_load_b128 v[10:13], v10, off
+; GFX11-GISEL-NEXT:    scratch_load_b128 v[14:17], v14, off
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 16, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v19, 48, v1
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-GISEL-NEXT:    scratch_store_b128 v1, v[2:5], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-GISEL-NEXT:    scratch_store_b128 v0, v[6:9], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-GISEL-NEXT:    scratch_store_b128 v18, v[10:13], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    scratch_store_b128 v19, v[14:17], off
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %a = load <16 x ptr addrspace(5)>, ptr addrspace(5) %ptra
+  %freeze = freeze <16 x ptr addrspace(5)> %a
+  store <16 x ptr addrspace(5)> %freeze, ptr addrspace(5) %ptrb
+  ret void
+}

>From 56369a858465566929b64c5417a7d3c97e76ea64 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 18 Mar 2025 23:14:46 +0700
Subject: [PATCH 11/19] DAG: Fix promote of half freeze

Round out the AMDGPU codegen test to all the generations to cover
the illegal f16 targets.
---
 .../SelectionDAG/LegalizeFloatTypes.cpp       |    5 +-
 llvm/test/CodeGen/AMDGPU/freeze.ll            | 8213 +++++++++++++++++
 2 files changed, 8217 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index c2107a73301bc..ba08de71bf9c1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2756,7 +2756,10 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
 #endif
       report_fatal_error("Do not know how to promote this operator's result!");
 
-    case ISD::BITCAST:    R = PromoteFloatRes_BITCAST(N); break;
+    case ISD::BITCAST:
+    case ISD::FREEZE:
+      R = PromoteFloatRes_BITCAST(N);
+      break;
     case ISD::ConstantFP: R = PromoteFloatRes_ConstantFP(N); break;
     case ISD::EXTRACT_VECTOR_ELT:
                           R = PromoteFloatRes_EXTRACT_VECTOR_ELT(N); break;
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 42d6e57585345..96725e6996e3d 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -1,10 +1,90 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 < %s | FileCheck -check-prefixes=GFX6,GFX6-SDAG  %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 < %s | FileCheck -check-prefixes=GFX6,GFX6-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG  %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG  %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG  %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG  %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
 
 define void @freeze_v2i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v2i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v2i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v2i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v2i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v2i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v2i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -27,6 +107,80 @@ define void @freeze_v2i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v3i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v3i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dword v4, v[2:3], s[4:7], 0 addr64 offset:8
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v3i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:8
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v3i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx3 v[4:6], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx3 v[4:6], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v3i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx3 v[4:6], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx3 v[4:6], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v3i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx3 v[4:6], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx3 v[2:3], v[4:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v3i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx3 v[4:6], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx3 v[2:3], v[4:6], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v3i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49,6 +203,74 @@ define void @freeze_v3i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v4i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v4i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v4i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v4i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v4i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v4i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v4i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v4i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -71,6 +293,96 @@ define void @freeze_v4i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v5i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v5i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v8, v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dword v8, v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v5i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v5i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v8, v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dword v8, v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v5i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v5i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dword v8, v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v8
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v5i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dword v8, v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dword v[2:3], v8, off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v5i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -125,6 +437,96 @@ define void @freeze_v5i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v6i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v6i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v6i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v6i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v6i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v6i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v6i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[8:9], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[8:9], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v6i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -179,6 +581,102 @@ define void @freeze_v6i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v7i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v7i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v10, v[0:1], s[4:7], 0 addr64 offset:24
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dword v10, v[2:3], s[4:7], 0 addr64 offset:24
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v7i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:24
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:24
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v7i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx3 v[8:10], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx3 v[8:10], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v7i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx3 v[8:10], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx3 v[8:10], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v7i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx3 v[8:10], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx3 v[0:1], v[8:10]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v7i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx3 v[8:10], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx3 v[2:3], v[8:10], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v7i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -233,6 +731,96 @@ define void @freeze_v7i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v8i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v8i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v8i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v8i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v8i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v8i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v8i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v8i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -287,6 +875,118 @@ define void @freeze_v8i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v9i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v9i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v12, v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dword v12, v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v9i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v9i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v12, v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dword v12, v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v9i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v9i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dword v14, v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dword v[12:13], v14
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v9i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dword v12, v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-GISEL-NEXT:    global_store_dword v[2:3], v12, off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v9i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -353,6 +1053,118 @@ define void @freeze_v9i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v10i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v10i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v10i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v10i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v10i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v10i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v14, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v15, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[14:15], v[0:1]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v10i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[0:1], off offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[12:13], off offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v10i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -389,6 +1201,124 @@ define void @freeze_v10i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v11i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v11i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v14, v[0:1], s[4:7], 0 addr64 offset:40
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[12:13], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dword v14, v[2:3], s[4:7], 0 addr64 offset:40
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[12:13], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v11i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[12:13], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:40
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[12:13], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:40
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v11i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx3 v[12:14], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx3 v[12:14], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v11i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx3 v[12:14], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx3 v[12:14], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v11i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx3 v[12:14], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v15, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx3 v[15:16], v[12:14]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v11i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-NEXT:    global_load_dwordx3 v[12:14], v[0:1], off offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_store_dwordx3 v[2:3], v[12:14], off offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v11i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -425,6 +1355,118 @@ define void @freeze_v11i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v12i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v12i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v12i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v12i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v12i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v12i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v12i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v12i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -460,6 +1502,140 @@ define void @freeze_v12i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
   ret void
 }
 define void @freeze_v13i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v13i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v16, v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dword v16, v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v13i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v13i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v16, v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dword v16, v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v13i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v13i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dword v18, v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dword v[2:3], v18
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v13i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dword v16, v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dword v[2:3], v16, off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v13i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -538,6 +1714,140 @@ define void @freeze_v13i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v14i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v14i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[16:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[16:17], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v14i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v14i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[16:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[16:17], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v14i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v14i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v18, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v19, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v14i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[16:17], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[16:17], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v14i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -616,6 +1926,150 @@ define void @freeze_v14i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v15i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v15i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v18, v[0:1], s[4:7], 0 addr64 offset:56
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[16:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dword v18, v[2:3], s[4:7], 0 addr64 offset:56
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[16:17], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v15i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[16:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:56
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[16:17], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:56
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v15i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v18, v[0:1], s[4:7], 0 addr64 offset:56
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[16:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dword v18, v[2:3], s[4:7], 0 addr64 offset:56
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[16:17], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v15i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx3 v[16:18], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx3 v[16:18], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v15i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx3 v[16:18], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx3 v[2:3], v[16:18]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v15i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx3 v[16:18], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx3 v[2:3], v[16:18], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v15i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -694,6 +2148,141 @@ define void @freeze_v15i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v16i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v16i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v16i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v16i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v16i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v16i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v16i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v16i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -772,6 +2361,160 @@ define void @freeze_v16i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v17i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v17i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v20, v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dword v20, v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v17i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v17i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v20, v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dword v20, v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v17i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v17i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v18, vcc, 64, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v19, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dword v20, v[18:19]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[12:15]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GFX8-GISEL-NEXT:    flat_store_dword v[2:3], v20
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v17i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dword v20, v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dword v[2:3], v20, off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v17i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -862,6 +2605,161 @@ define void @freeze_v17i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v18i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v18i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v18i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v18i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v18i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v18i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v18, vcc, 64, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v19, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[18:19]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v20, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v21, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[12:15]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[6:7], v[16:19]
+; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v18i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[20:21], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[20:21], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v18i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -952,6 +2850,169 @@ define void @freeze_v18i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v19i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v19i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dword v22, v[2:3], s[4:7], 0 addr64 offset:72
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v19i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_store_dword v22, v[2:3], s[4:7], 0 addr64 offset:72
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v19i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v22, v[0:1], s[4:7], 0 addr64 offset:72
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[20:21], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dword v22, v[2:3], s[4:7], 0 addr64 offset:72
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[20:21], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v19i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx3 v[20:22], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx3 v[20:22], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v19i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v20, vcc, 64, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v21, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    flat_load_dwordx3 v[20:22], v[20:21]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx3 v[6:7], v[20:22]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v19i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx3 v[20:22], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx3 v[2:3], v[20:22], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v19i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1042,6 +3103,163 @@ define void @freeze_v19i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v20i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v20i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v20i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v20i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v20i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v20i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[6:7], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[20:23]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v20i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v20i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1132,6 +3350,185 @@ define void @freeze_v20i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v21i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v21i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dword v24, v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dword v24, v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v21i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v21i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dword v24, v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dword v24, v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v21i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v21i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v6, 0x50
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, v0, v6
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    flat_load_dword v26, v[8:9]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v24, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v25, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 0x50, v2
+; GFX8-GISEL-NEXT:    v_add_u32_e64 v8, s[4:5], 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[8:9], v[20:23]
+; GFX8-GISEL-NEXT:    flat_store_dword v[6:7], v26
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v21i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    global_load_dword v24, v[0:1], off offset:80
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dword v[2:3], v24, off offset:80
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v21i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1234,6 +3631,185 @@ define void @freeze_v21i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v22i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v22i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[24:25], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[24:25], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v22i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v22i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[24:25], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[24:25], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v22i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v22i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v6, 0x50
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, v0, v6
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    flat_load_dwordx2 v[24:25], v[8:9]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 64, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[20:23], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v26, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v27, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[26:27], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 0x50, v2
+; GFX8-GISEL-NEXT:    v_add_u32_e64 v8, s[4:5], 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[8:9], v[20:23]
+; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[6:7], v[24:25]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v22i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[24:25], v[0:1], off offset:80
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[24:25], off offset:80
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v22i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1336,6 +3912,235 @@ define void @freeze_v22i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v30i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v30i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v30i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v30i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v30i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v30i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v20, vcc, 64, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v21, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v34, 0x50
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v24, vcc, v0, v34
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x60
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v28, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x70
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GFX8-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v32, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v33, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[32:33], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v32, vcc, v2, v34
+; GFX8-GISEL-NEXT:    v_add_u32_e64 v34, s[4:5], 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v35, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    s_mov_b64 s[4:5], vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v33, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 0x60, v2
+; GFX8-GISEL-NEXT:    s_mov_b64 s[4:5], vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[6:7], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[34:35], v[20:23]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[32:33], v[24:27]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[8:9], v[28:31]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v30i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:80
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:96
+; GFX9-GISEL-NEXT:    s_nop 0
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[24:27], off offset:80
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off offset:96
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v30i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1462,6 +4267,239 @@ define void @freeze_v30i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v31i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-SDAG-NEXT:    buffer_store_dword v34, v[2:3], s[4:7], 0 addr64 offset:120
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v31i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(8)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(6)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    buffer_store_dword v34, v[2:3], s[4:7], 0 addr64 offset:120
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v31i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    buffer_load_dword v34, v[0:1], s[4:7], 0 addr64 offset:120
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[32:33], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-SDAG-NEXT:    buffer_store_dword v34, v[2:3], s[4:7], 0 addr64 offset:120
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[32:33], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(8)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v31i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    buffer_load_dwordx3 v[32:34], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx3 v[32:34], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v31i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v20, vcc, 64, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v21, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v35, 0x50
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v24, vcc, v0, v35
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x60
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v28, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x70
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GFX8-GISEL-NEXT:    flat_load_dwordx3 v[32:34], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, v2, v35
+; GFX8-GISEL-NEXT:    v_add_u32_e64 v8, s[4:5], 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    s_mov_b64 s[4:5], vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v10, vcc, 0x60, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[8:9], v[20:23]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[6:7], v[24:27]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[10:11], v[28:31]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx3 v[2:3], v[32:34]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v31i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:80
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:96
+; GFX9-GISEL-NEXT:    global_load_dwordx3 v[32:34], v[0:1], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[24:27], off offset:80
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off offset:96
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx3 v[2:3], v[32:34], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v31i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1588,6 +4626,233 @@ define void @freeze_v31i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v32i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v32i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v32i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v32i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v32i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v32i32:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v20, vcc, 64, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v21, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v38, 0x50
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v24, vcc, v0, v38
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x60
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v28, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x70
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[32:35], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v36, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v37, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[36:37], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, v2, v38
+; GFX8-GISEL-NEXT:    v_add_u32_e64 v8, s[4:5], 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    s_mov_b64 s[4:5], vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v10, vcc, 0x60, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[8:9], v[20:23]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[6:7], v[24:27]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[10:11], v[28:31]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[32:35]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v32i32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:80
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:96
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[24:27], off offset:80
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off offset:96
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[32:35], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v32i32:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1714,6 +4979,74 @@ define void @freeze_v32i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_i32:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_i32:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_i32:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_i32:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1736,6 +5069,74 @@ define void @freeze_i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_i64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_i64:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_i64:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_i64:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_i64:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_i64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1758,6 +5159,74 @@ define void @freeze_i64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_float(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_float:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_float:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_float:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_float:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_float:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_float:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_float:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1780,6 +5249,74 @@ define void @freeze_float(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_i128(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_i128:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_i128:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_i128:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_i128:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_i128:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_i128:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_i128:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1802,6 +5339,96 @@ define void @freeze_i128(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_i256(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_i256:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_i256:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_i256:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_i256:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_i256:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_i256:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_i256:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1856,6 +5483,74 @@ define void @freeze_i256(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_i16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_i16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_i16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_i16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_short v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1878,6 +5573,74 @@ define void @freeze_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v2i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v2i16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v2i16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v2i16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v2i16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v2i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1900,6 +5663,99 @@ define void @freeze_v2i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v3i16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX6-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX6-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v3i16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX6-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_store_short v4, v[2:3], s[4:7], 0 addr64 offset:2
+; GFX6-GISEL-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v3i16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX7-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v3i16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_store_short v4, v[2:3], s[4:7], 0 addr64 offset:2
+; GFX7-GISEL-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v3i16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 2, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX8-GISEL-NEXT:    flat_store_short v[2:3], v0
+; GFX8-GISEL-NEXT:    flat_store_short v[4:5], v8
+; GFX8-GISEL-NEXT:    flat_store_short v[6:7], v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v3i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-GISEL-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX9-GISEL-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v3i16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1946,6 +5802,74 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v4i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v4i16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v4i16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v4i16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v4i16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v4i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v4i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1968,6 +5892,74 @@ define void @freeze_v4i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v8i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v8i16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v8i16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v8i16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v8i16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v8i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v8i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v8i16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1990,6 +5982,96 @@ define void @freeze_v8i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v16i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v16i16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v16i16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v16i16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v16i16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v16i16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v16i16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v16i16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2044,6 +6126,74 @@ define void @freeze_v16i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_f16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_f16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_f16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_f16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_short v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2066,6 +6216,74 @@ define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v2f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v2f16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v2f16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v2f16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v2f16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2088,6 +6306,91 @@ define void @freeze_v2f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v3f16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v3f16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX6-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_store_short v4, v[2:3], s[4:7], 0 addr64 offset:2
+; GFX6-GISEL-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v3f16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v3f16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_store_short v4, v[2:3], s[4:7], 0 addr64 offset:2
+; GFX7-GISEL-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v3f16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 2, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX8-GISEL-NEXT:    flat_store_short v[2:3], v0
+; GFX8-GISEL-NEXT:    flat_store_short v[4:5], v8
+; GFX8-GISEL-NEXT:    flat_store_short v[6:7], v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v3f16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-GISEL-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX9-GISEL-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v3f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2134,6 +6437,74 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v4f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v4f16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v4f16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v4f16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v4f16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v4f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2156,6 +6527,74 @@ define void @freeze_v4f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v8f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v8f16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v8f16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v8f16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v8f16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v8f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v8f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v8f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2178,6 +6617,96 @@ define void @freeze_v8f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v16f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v16f16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v16f16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v16f16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v16f16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v16f16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v16f16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v16f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2232,6 +6761,80 @@ define void @freeze_v16f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_bf16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX6-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-SDAG-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_bf16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_bf16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-SDAG-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_bf16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_short v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2254,6 +6857,74 @@ define void @freeze_bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v2bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v2bf16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v2bf16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v2bf16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v2bf16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2276,6 +6947,109 @@ define void @freeze_v2bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v3bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v3bf16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX6-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-SDAG-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX6-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX6-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX6-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-SDAG-NEXT:    v_alignbit_b32 v0, v4, v0, 16
+; GFX6-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v3bf16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX6-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_store_short v4, v[2:3], s[4:7], 0 addr64 offset:2
+; GFX6-GISEL-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v3bf16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-SDAG-NEXT:    v_alignbit_b32 v0, v4, v0, 16
+; GFX7-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v3bf16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX7-GISEL-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_store_short v4, v[2:3], s[4:7], 0 addr64 offset:2
+; GFX7-GISEL-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v3bf16:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 2, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX8-GISEL-NEXT:    flat_store_short v[2:3], v0
+; GFX8-GISEL-NEXT:    flat_store_short v[4:5], v8
+; GFX8-GISEL-NEXT:    flat_store_short v[6:7], v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v3bf16:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_store_short v[2:3], v0, off
+; GFX9-GISEL-NEXT:    global_store_short_d16_hi v[2:3], v0, off offset:2
+; GFX9-GISEL-NEXT:    global_store_short v[2:3], v1, off offset:4
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v3bf16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2322,6 +7096,74 @@ define void @freeze_v3bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v4bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v4bf16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v4bf16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v4bf16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v4bf16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2344,6 +7186,74 @@ define void @freeze_v4bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v8bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v8bf16:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v8bf16:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v8bf16:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v8bf16:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2366,6 +7276,74 @@ define void @freeze_v8bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_f64:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_f64:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_f64:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_f64:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_f64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2388,6 +7366,74 @@ define void @freeze_f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v2f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v2f64:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v2f64:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v2f64:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v2f64:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v2f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v2f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v2f64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2410,6 +7456,96 @@ define void @freeze_v2f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v3f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v3f64:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v3f64:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v3f64:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v3f64:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v3f64:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v3f64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[8:9], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v3f64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2464,6 +7600,96 @@ define void @freeze_v3f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v4f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v4f64:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v4f64:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v4f64:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v4f64:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v4f64:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v4f64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v4f64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2518,6 +7744,141 @@ define void @freeze_v4f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v8f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v8f64:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v8f64:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v8f64:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v8f64:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v8f64:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v8f64:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v8f64:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2596,6 +7957,74 @@ define void @freeze_v8f64(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_p0:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_p0:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_p0:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_p0:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_p0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_p0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_p0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2618,6 +8047,74 @@ define void @freeze_p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v2p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v2p0:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v2p0:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v2p0:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v2p0:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v2p0:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v2p0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v2p0:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2640,6 +8137,105 @@ define void @freeze_v2p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v3p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v3p0:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v3p0:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[6:9], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v3p0:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v3p0:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[6:9], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v3p0:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 16, v2
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, v9
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v3p0:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
+; GFX9-GISEL-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[6:9], off
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v3p0:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2699,6 +8295,96 @@ define void @freeze_v3p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v4p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v4p0:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v4p0:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v4p0:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v4p0:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v4p0:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v4p0:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v4p0:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2753,6 +8439,141 @@ define void @freeze_v4p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v8p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v8p0:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v8p0:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v8p0:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v8p0:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v8p0:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v8p0:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v8p0:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2831,6 +8652,233 @@ define void @freeze_v8p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v16p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v16p0:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v16p0:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v16p0:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v16p0:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v16p0:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v20, vcc, 64, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v21, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v38, 0x50
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v24, vcc, v0, v38
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x60
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v28, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x70
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[32:35], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v36, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v37, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[36:37], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, v2, v38
+; GFX8-GISEL-NEXT:    v_add_u32_e64 v8, s[4:5], 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    s_mov_b64 s[4:5], vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v10, vcc, 0x60, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[8:9], v[20:23]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[6:7], v[24:27]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[10:11], v[28:31]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[32:35]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v16p0:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:80
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:96
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[24:27], off offset:80
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off offset:96
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[32:35], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v16p0:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2957,6 +9005,74 @@ define void @freeze_v16p0(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_p1:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_p1:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_p1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_p1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_p1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_p1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_p1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2979,6 +9095,74 @@ define void @freeze_p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v2p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v2p1:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v2p1:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v2p1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v2p1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v2p1:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v2p1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v2p1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3001,6 +9185,105 @@ define void @freeze_v2p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v3p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v3p1:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v3p1:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[6:9], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v3p1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v3p1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[6:9], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v3p1:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 16, v2
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, v9
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v3p1:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
+; GFX9-GISEL-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[6:9], off
+; GFX9-GISEL-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v3p1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3060,6 +9343,96 @@ define void @freeze_v3p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v4p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v4p1:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v4p1:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v4p1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v4p1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v4p1:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v4p1:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v4p1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3114,6 +9487,141 @@ define void @freeze_v4p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v8p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v8p1:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v8p1:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v8p1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v8p1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v8p1:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v8p1:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v8p1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3192,6 +9700,233 @@ define void @freeze_v8p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_v16p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v16p1:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v16p1:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX6-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v16p1:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
+; GFX7-SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-SDAG-NEXT:    s_mov_b32 s4, s6
+; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v16p1:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX7-GISEL-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[24:27], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    buffer_load_dwordx4 v[32:35], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[12:15], v[2:3], s[4:7], 0 addr64 offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[16:19], v[2:3], s[4:7], 0 addr64 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[20:23], v[2:3], s[4:7], 0 addr64 offset:64
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[24:27], v[2:3], s[4:7], 0 addr64 offset:80
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[28:31], v[2:3], s[4:7], 0 addr64 offset:96
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dwordx4 v[32:35], v[2:3], s[4:7], 0 addr64 offset:112
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v16p1:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 48, v0
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v20, vcc, 64, v0
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v21, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v38, 0x50
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v24, vcc, v0, v38
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x60
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v28, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v29, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    v_mov_b32_e32 v14, 0x70
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v14
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[20:23], v[20:21]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[24:27], v[24:25]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[28:31], v[28:29]
+; GFX8-GISEL-NEXT:    flat_load_dwordx4 v[32:35], v[0:1]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v36, vcc, 16, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v37, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 32, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[36:37], v[4:7]
+; GFX8-GISEL-NEXT:    s_nop 0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 48, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, v2, v38
+; GFX8-GISEL-NEXT:    v_add_u32_e64 v8, s[4:5], 64, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    s_mov_b64 s[4:5], vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v10, vcc, 0x60, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v2
+; GFX8-GISEL-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, v3, s[4:5]
+; GFX8-GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[16:19]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[8:9], v[20:23]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[6:7], v[24:27]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[10:11], v[28:31]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    flat_store_dwordx4 v[2:3], v[32:35]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v16p1:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:32
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:48
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:64
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:80
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:96
+; GFX9-GISEL-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[12:15], off offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[20:23], off offset:64
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[24:27], off offset:80
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[28:31], off offset:96
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT:    global_store_dwordx4 v[2:3], v[32:35], off offset:112
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v16p1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3318,6 +10053,45 @@ define void @freeze_v16p1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 }
 
 define void @freeze_p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX6-LABEL: freeze_p3:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    ds_read_b32 v0, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b32 v1, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: freeze_p3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b32 v0, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b32 v1, v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_p3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b32 v0, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_write_b32 v1, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_p3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b32 v1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_p3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3342,6 +10116,45 @@ define void @freeze_p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
 }
 
 define void @freeze_v2p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX6-LABEL: freeze_v2p3:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    ds_read_b64 v[2:3], v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: freeze_v2p3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b64 v[2:3], v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v2p3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b64 v[2:3], v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v2p3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b64 v[2:3], v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v2p3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3366,6 +10179,65 @@ define void @freeze_v2p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
 }
 
 define void @freeze_v3p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v3p3:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
+; GFX6-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX6-SDAG-NEXT:    ds_read_b32 v4, v2
+; GFX6-SDAG-NEXT:    ds_read_b64 v[2:3], v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX6-SDAG-NEXT:    ds_write_b32 v0, v4
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX6-SDAG-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v3p3:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX6-GISEL-NEXT:    ds_read_b64 v[2:3], v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
+; GFX6-GISEL-NEXT:    ds_read_b32 v0, v0
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX6-GISEL-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 8, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX6-GISEL-NEXT:    ds_write_b32 v1, v0
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: freeze_v3p3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b96 v[2:4], v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b96 v1, v[2:4]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v3p3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b96 v[2:4], v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_write_b96 v1, v[2:4]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v3p3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b96 v[2:4], v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b96 v1, v[2:4]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v3p3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3390,6 +10262,50 @@ define void @freeze_v3p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
 }
 
 define void @freeze_v4p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX6-LABEL: freeze_v4p3:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    ds_read_b64 v[2:3], v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
+; GFX6-NEXT:    ds_read_b64 v[4:5], v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 8, v1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX6-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX6-NEXT:    ds_write_b64 v0, v[4:5]
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: freeze_v4p3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_mov_b32 m0, -1
+; GFX7-NEXT:    ds_read_b128 v[2:5], v0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_v4p3:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b128 v[2:5], v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v4p3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b128 v[2:5], v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v4p3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3414,6 +10330,105 @@ define void @freeze_v4p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
 }
 
 define void @freeze_v8p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v8p3:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
+; GFX6-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX6-SDAG-NEXT:    ds_read_b64 v[2:3], v2
+; GFX6-SDAG-NEXT:    ds_read_b64 v[4:5], v4
+; GFX6-SDAG-NEXT:    ds_read_b64 v[6:7], v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
+; GFX6-SDAG-NEXT:    ds_read_b64 v[8:9], v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 16, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[4:5]
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 24, v1
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[2:3]
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX6-SDAG-NEXT:    ds_write_b64 v1, v[6:7]
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[8:9]
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v8p3:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 16, v0
+; GFX6-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX6-GISEL-NEXT:    ds_read_b64 v[2:3], v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 24, v0
+; GFX6-GISEL-NEXT:    ds_read_b64 v[4:5], v4
+; GFX6-GISEL-NEXT:    ds_read_b64 v[6:7], v6
+; GFX6-GISEL-NEXT:    ds_read_b64 v[8:9], v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 8, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[4:5]
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 16, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[6:7]
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 24, v1
+; GFX6-GISEL-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[8:9]
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v8p3:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX7-SDAG-NEXT:    ds_read_b128 v[2:5], v0 offset:16
+; GFX7-SDAG-NEXT:    ds_read_b128 v[6:9], v0
+; GFX7-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-SDAG-NEXT:    ds_write_b128 v1, v[2:5] offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-SDAG-NEXT:    ds_write_b128 v1, v[6:9]
+; GFX7-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v8p3:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX7-GISEL-NEXT:    ds_read_b128 v[2:5], v0
+; GFX7-GISEL-NEXT:    ds_read_b128 v[6:9], v0 offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-GISEL-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX7-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX7-GISEL-NEXT:    ds_write_b128 v1, v[6:9] offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v8p3:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX8-GISEL-NEXT:    ds_read_b128 v[2:5], v0
+; GFX8-GISEL-NEXT:    ds_read_b128 v[6:9], v0 offset:16
+; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX8-GISEL-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX8-GISEL-NEXT:    ds_write_b128 v1, v[6:9] offset:16
+; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v8p3:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    ds_read_b128 v[2:5], v0
+; GFX9-GISEL-NEXT:    ds_read_b128 v[6:9], v0 offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-GISEL-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-GISEL-NEXT:    ds_write_b128 v1, v[6:9] offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v8p3:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3468,6 +10483,164 @@ define void @freeze_v8p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
 }
 
 define void @freeze_v16p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v16p3:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
+; GFX6-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v12, vcc, 40, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 32, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v16, vcc, 56, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v10, vcc, 48, v0
+; GFX6-SDAG-NEXT:    ds_read_b64 v[2:3], v2
+; GFX6-SDAG-NEXT:    ds_read_b64 v[4:5], v4
+; GFX6-SDAG-NEXT:    ds_read_b64 v[6:7], v6
+; GFX6-SDAG-NEXT:    ds_read_b64 v[8:9], v0
+; GFX6-SDAG-NEXT:    ds_read_b64 v[10:11], v10
+; GFX6-SDAG-NEXT:    ds_read_b64 v[12:13], v12
+; GFX6-SDAG-NEXT:    ds_read_b64 v[14:15], v14
+; GFX6-SDAG-NEXT:    ds_read_b64 v[16:17], v16
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 48, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[10:11]
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 56, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[16:17]
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 32, v1
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[14:15]
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 40, v1
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[12:13]
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 16, v1
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[4:5]
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 24, v1
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[2:3]
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v1
+; GFX6-SDAG-NEXT:    ds_write_b64 v1, v[8:9]
+; GFX6-SDAG-NEXT:    ds_write_b64 v0, v[6:7]
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v16p3:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 16, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 24, v0
+; GFX6-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX6-GISEL-NEXT:    ds_read_b64 v[2:3], v0
+; GFX6-GISEL-NEXT:    ds_read_b64 v[4:5], v4
+; GFX6-GISEL-NEXT:    ds_read_b64 v[6:7], v6
+; GFX6-GISEL-NEXT:    ds_read_b64 v[8:9], v8
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v10, vcc, 32, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v12, vcc, 40, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v14, vcc, 48, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 56, v0
+; GFX6-GISEL-NEXT:    ds_read_b64 v[10:11], v10
+; GFX6-GISEL-NEXT:    ds_read_b64 v[12:13], v12
+; GFX6-GISEL-NEXT:    ds_read_b64 v[14:15], v14
+; GFX6-GISEL-NEXT:    ds_read_b64 v[16:17], v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 8, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[4:5]
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 16, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[6:7]
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 24, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[8:9]
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 32, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[10:11]
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 40, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[12:13]
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 48, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[14:15]
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 56, v1
+; GFX6-GISEL-NEXT:    ds_write_b64 v1, v[2:3]
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
+; GFX6-GISEL-NEXT:    ds_write_b64 v0, v[16:17]
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v16p3:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX7-SDAG-NEXT:    ds_read_b128 v[2:5], v0 offset:32
+; GFX7-SDAG-NEXT:    ds_read_b128 v[6:9], v0 offset:48
+; GFX7-SDAG-NEXT:    ds_read_b128 v[10:13], v0
+; GFX7-SDAG-NEXT:    ds_read_b128 v[14:17], v0 offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-SDAG-NEXT:    ds_write_b128 v1, v[2:5] offset:32
+; GFX7-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-SDAG-NEXT:    ds_write_b128 v1, v[6:9] offset:48
+; GFX7-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-SDAG-NEXT:    ds_write_b128 v1, v[10:13]
+; GFX7-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-SDAG-NEXT:    ds_write_b128 v1, v[14:17] offset:16
+; GFX7-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v16p3:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX7-GISEL-NEXT:    ds_read_b128 v[2:5], v0
+; GFX7-GISEL-NEXT:    ds_read_b128 v[6:9], v0 offset:16
+; GFX7-GISEL-NEXT:    ds_read_b128 v[10:13], v0 offset:32
+; GFX7-GISEL-NEXT:    ds_read_b128 v[14:17], v0 offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-GISEL-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX7-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-GISEL-NEXT:    ds_write_b128 v1, v[6:9] offset:16
+; GFX7-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-GISEL-NEXT:    ds_write_b128 v1, v[10:13] offset:32
+; GFX7-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-GISEL-NEXT:    ds_write_b128 v1, v[14:17] offset:48
+; GFX7-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v16p3:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX8-GISEL-NEXT:    ds_read_b128 v[2:5], v0
+; GFX8-GISEL-NEXT:    ds_read_b128 v[6:9], v0 offset:16
+; GFX8-GISEL-NEXT:    ds_read_b128 v[10:13], v0 offset:32
+; GFX8-GISEL-NEXT:    ds_read_b128 v[14:17], v0 offset:48
+; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX8-GISEL-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX8-GISEL-NEXT:    ds_write_b128 v1, v[6:9] offset:16
+; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX8-GISEL-NEXT:    ds_write_b128 v1, v[10:13] offset:32
+; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX8-GISEL-NEXT:    ds_write_b128 v1, v[14:17] offset:48
+; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: freeze_v16p3:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    ds_read_b128 v[2:5], v0
+; GFX9-GISEL-NEXT:    ds_read_b128 v[6:9], v0 offset:16
+; GFX9-GISEL-NEXT:    ds_read_b128 v[10:13], v0 offset:32
+; GFX9-GISEL-NEXT:    ds_read_b128 v[14:17], v0 offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX9-GISEL-NEXT:    ds_write_b128 v1, v[2:5]
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX9-GISEL-NEXT:    ds_write_b128 v1, v[6:9] offset:16
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX9-GISEL-NEXT:    ds_write_b128 v1, v[10:13] offset:32
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX9-GISEL-NEXT:    ds_write_b128 v1, v[14:17] offset:48
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: freeze_v16p3:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3546,6 +10719,42 @@ define void @freeze_v16p3(ptr addrspace(3) %ptra, ptr addrspace(3) %ptrb) {
 }
 
 define void @freeze_p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX6-LABEL: freeze_p5:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: freeze_p5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: freeze_p5:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_p5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_p5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3568,6 +10777,88 @@ define void @freeze_p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 }
 
 define void @freeze_v2p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v2p5:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 4, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v2p5:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 4, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v2p5:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 4, v1
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v2p5:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 4, v1
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v2p5:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v1, vcc, 4, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX8-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v2p5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v2p5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3594,6 +10885,114 @@ define void @freeze_v2p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 }
 
 define void @freeze_v3p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v3p5:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 4, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 8, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v3p5:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 4, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 8, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX6-GISEL-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v3p5:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 8, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 4, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 8, v1
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v3p5:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 4, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 8, v1
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX7-GISEL-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v3p5:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v3, vcc, 8, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 4, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v5, vcc, 8, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX8-GISEL-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v3p5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v3p5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3623,6 +11022,140 @@ define void @freeze_v3p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 }
 
 define void @freeze_v4p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v4p5:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 4, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 8, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 12, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v7, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v4p5:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 12, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 4, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 8, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 12, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX6-GISEL-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v4p5:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 4, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 8, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 12, v1
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v7, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v4p5:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 12, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 4, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 8, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 12, v1
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX7-GISEL-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v4p5:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v3, vcc, 8, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 12, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v5, vcc, 4, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 8, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v7, vcc, 12, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX8-GISEL-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v4p5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v4p5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3655,6 +11188,244 @@ define void @freeze_v4p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 }
 
 define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v8p5:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 4, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 28, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v9, vcc, 4, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v10, vcc, 8, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v11, vcc, 12, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v12, vcc, 16, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v13, vcc, 20, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 24, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v15, vcc, 28, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX6-SDAG-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v5, v11, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v4, v12, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v3, v13, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v2, v14, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v15, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v8p5:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 12, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 16, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 20, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 24, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 28, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v9, vcc, 4, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v10, vcc, 8, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v11, vcc, 12, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v12, vcc, 16, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v13, vcc, 20, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v14, vcc, 24, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v15, vcc, 28, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dword v2, v9, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dword v4, v11, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dword v5, v12, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dword v6, v13, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dword v7, v14, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-GISEL-NEXT:    buffer_store_dword v8, v15, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v8p5:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 24, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 4, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 28, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v9, vcc, 4, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v10, vcc, 8, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v11, vcc, 12, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v12, vcc, 16, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v13, vcc, 20, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 24, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v15, vcc, 28, v1
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-SDAG-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v6, v10, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v5, v11, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v4, v12, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v3, v13, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v2, v14, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v15, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v8p5:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 12, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 16, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 20, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 24, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 28, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v9, vcc, 4, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v10, vcc, 8, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v11, vcc, 12, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v12, vcc, 16, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v13, vcc, 20, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v14, vcc, 24, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v15, vcc, 28, v1
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dword v2, v9, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dword v4, v11, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dword v5, v12, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dword v6, v13, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dword v7, v14, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-GISEL-NEXT:    buffer_store_dword v8, v15, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v8p5:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v3, vcc, 8, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v4, vcc, 12, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v5, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 20, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v7, vcc, 24, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 28, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v9, vcc, 4, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v10, vcc, 8, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v11, vcc, 12, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 16, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v13, vcc, 20, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v14, vcc, 24, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v15, vcc, 28, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    buffer_store_dword v2, v9, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    buffer_store_dword v4, v11, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    buffer_store_dword v5, v12, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    buffer_store_dword v6, v13, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    buffer_store_dword v7, v14, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(7)
+; GFX8-GISEL-NEXT:    buffer_store_dword v8, v15, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v8p5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    buffer_load_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    buffer_load_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v8p5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3717,6 +11488,446 @@ define void @freeze_v8p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
 }
 
 define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
+; GFX6-SDAG-LABEL: freeze_v16p5:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 16, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 48, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v9, vcc, 44, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v12, vcc, 28, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v13, vcc, 24, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v9, v9, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v10, v10, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v11, v11, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v15, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v12, v12, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v13, v13, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v14, v14, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v16, vcc, 32, v0
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; GFX6-SDAG-NEXT:    buffer_load_dword v16, v16, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v17, vcc, 4, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v18, vcc, 8, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v19, vcc, 12, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-SDAG-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-SDAG-NEXT:    buffer_store_dword v7, v18, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-SDAG-NEXT:    buffer_store_dword v8, v17, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 16, v1
+; GFX6-SDAG-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 40, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v17, vcc, 20, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 24, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v18, vcc, 28, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 32, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v19, vcc, 36, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 44, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX6-SDAG-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX6-SDAG-NEXT:    buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v13, v7, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v12, v18, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX6-SDAG-NEXT:    buffer_store_dword v16, v6, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 48, v1
+; GFX6-SDAG-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 52, v1
+; GFX6-SDAG-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 56, v1
+; GFX6-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 60, v1
+; GFX6-SDAG-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: freeze_v16p5:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 16, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 20, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 24, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v9, vcc, 28, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v10, vcc, 32, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v12, vcc, 40, v0
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v13, vcc, 44, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v9, v9, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v10, v10, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v11, v11, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v12, v12, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_load_dword v13, v13, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v14, vcc, 48, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v14, v14, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v15, vcc, 52, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v15, v15, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v16, vcc, 56, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v16, v16, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; GFX6-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v17, vcc, 4, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v18, vcc, 8, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v19, vcc, 12, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt expcnt(0)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 16, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v17, vcc, 20, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(12)
+; GFX6-GISEL-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 40, v1
+; GFX6-GISEL-NEXT:    buffer_store_dword v3, v18, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt expcnt(0)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 24, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v18, vcc, 28, v1
+; GFX6-GISEL-NEXT:    buffer_store_dword v5, v19, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt expcnt(0)
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 32, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v19, vcc, 36, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 44, v1
+; GFX6-GISEL-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v7, v17, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_store_dword v8, v3, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v9, v18, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v13, v6, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 48, v1
+; GFX6-GISEL-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 52, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 56, v1
+; GFX6-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 60, v1
+; GFX6-GISEL-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX6-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-SDAG-LABEL: freeze_v16p5:
+; GFX7-SDAG:       ; %bb.0:
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 16, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 48, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v9, vcc, 44, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v12, vcc, 28, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v13, vcc, 24, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v4, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v9, v9, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v10, v10, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v11, v11, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v15, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v12, v12, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v13, v13, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v14, v14, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v16, vcc, 32, v0
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; GFX7-SDAG-NEXT:    buffer_load_dword v16, v16, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v17, vcc, 4, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v18, vcc, 8, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v19, vcc, 12, v1
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-SDAG-NEXT:    buffer_store_dword v6, v19, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-SDAG-NEXT:    buffer_store_dword v7, v18, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-SDAG-NEXT:    buffer_store_dword v8, v17, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 16, v1
+; GFX7-SDAG-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 40, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v17, vcc, 20, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v7, vcc, 24, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v18, vcc, 28, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v6, vcc, 32, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v19, vcc, 36, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 44, v1
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-SDAG-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(7)
+; GFX7-SDAG-NEXT:    buffer_store_dword v14, v17, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v13, v7, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v12, v18, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(9)
+; GFX7-SDAG-NEXT:    buffer_store_dword v16, v6, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v5, vcc, 48, v1
+; GFX7-SDAG-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 52, v1
+; GFX7-SDAG-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 56, v1
+; GFX7-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 60, v1
+; GFX7-SDAG-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-SDAG-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: freeze_v16p5:
+; GFX7-GISEL:       ; %bb.0:
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 16, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 20, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 24, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v9, vcc, 28, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v10, vcc, 32, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v12, vcc, 40, v0
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v13, vcc, 44, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v9, v9, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v10, v10, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v11, v11, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v12, v12, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_load_dword v13, v13, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v14, vcc, 48, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v14, v14, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v15, vcc, 52, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v15, v15, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v16, vcc, 56, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v16, v16, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
+; GFX7-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v17, vcc, 4, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v18, vcc, 8, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v19, vcc, 12, v1
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 16, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v17, vcc, 20, v1
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(12)
+; GFX7-GISEL-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 40, v1
+; GFX7-GISEL-NEXT:    buffer_store_dword v3, v18, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 24, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v18, vcc, 28, v1
+; GFX7-GISEL-NEXT:    buffer_store_dword v5, v19, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 32, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v19, vcc, 36, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 44, v1
+; GFX7-GISEL-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v7, v17, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_store_dword v8, v3, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v9, v18, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v13, v6, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 48, v1
+; GFX7-GISEL-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 52, v1
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 56, v1
+; GFX7-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 60, v1
+; GFX7-GISEL-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX7-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: freeze_v16p5:
+; GFX8-GISEL:       ; %bb.0:
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v3, vcc, 8, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v5, vcc, 12, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 16, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v7, vcc, 20, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v8, vcc, 24, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v9, vcc, 28, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v10, vcc, 32, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v11, vcc, 36, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 40, v0
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v13, vcc, 44, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v3, v3, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v5, v5, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v6, v6, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v7, v7, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v8, v8, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v9, v9, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v10, v10, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v11, v11, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v12, v12, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_load_dword v13, v13, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v14, vcc, 48, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v14, v14, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v15, vcc, 52, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v15, v15, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v16, vcc, 56, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v16, v16, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 60, v0
+; GFX8-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v17, vcc, 4, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v18, vcc, 8, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v19, vcc, 12, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v2, v17, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 16, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v17, vcc, 20, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(12)
+; GFX8-GISEL-NEXT:    buffer_store_dword v6, v2, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 40, v1
+; GFX8-GISEL-NEXT:    buffer_store_dword v3, v18, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v3, vcc, 24, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v18, vcc, 28, v1
+; GFX8-GISEL-NEXT:    buffer_store_dword v5, v19, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v5, vcc, 32, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v19, vcc, 36, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 44, v1
+; GFX8-GISEL-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v7, v17, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_store_dword v8, v3, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v9, v18, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v11, v19, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v13, v6, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 48, v1
+; GFX8-GISEL-NEXT:    buffer_store_dword v14, v2, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 52, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v15, v2, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 56, v1
+; GFX8-GISEL-NEXT:    v_add_u32_e32 v1, vcc, 60, v1
+; GFX8-GISEL-NEXT:    buffer_store_dword v16, v2, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-GISEL-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: freeze_v16p5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    buffer_load_dword v6, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    buffer_load_dword v7, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    buffer_load_dword v9, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    buffer_load_dword v10, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT:    buffer_load_dword v11, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT:    buffer_load_dword v12, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT:    buffer_load_dword v13, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT:    buffer_load_dword v14, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT:    buffer_load_dword v15, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT:    buffer_load_dword v16, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen offset:36
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen offset:40
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen offset:44
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen offset:52
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen offset:56
+; GFX9-NEXT:    s_waitcnt vmcnt(15)
+; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:60
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: freeze_v16p5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3817,3 +12028,5 @@ define void @freeze_v16p5(ptr addrspace(5) %ptra, ptr addrspace(5) %ptrb) {
   store <16 x ptr addrspace(5)> %freeze, ptr addrspace(5) %ptrb
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX8-SDAG: {{.*}}

>From 6acfd7a85a877d28ebb84f110c946f3734c0d2c4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 19 Mar 2025 07:28:12 +0700
Subject: [PATCH 12/19] Fix losing the actual freeze

---
 .../CodeGen/SelectionDAG/LegalizeFloatTypes.cpp  | 16 +++++++++++++++-
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h    |  1 +
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index ba08de71bf9c1..61eb70ef6c5d8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2757,9 +2757,11 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
       report_fatal_error("Do not know how to promote this operator's result!");
 
     case ISD::BITCAST:
-    case ISD::FREEZE:
       R = PromoteFloatRes_BITCAST(N);
       break;
+    case ISD::FREEZE:
+      R = PromoteFloatRes_FREEZE(N);
+      break;
     case ISD::ConstantFP: R = PromoteFloatRes_ConstantFP(N); break;
     case ISD::EXTRACT_VECTOR_ELT:
                           R = PromoteFloatRes_EXTRACT_VECTOR_ELT(N); break;
@@ -2872,6 +2874,18 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_BITCAST(SDNode *N) {
   return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, Cast);
 }
 
+SDValue DAGTypeLegalizer::PromoteFloatRes_FREEZE(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  // Bitcast the operand to a same-width integer (legalized further if
+  // necessary) and freeze that, so the FREEZE itself is not dropped.
+  EVT IVT = EVT::getIntegerVT(*DAG.getContext(),
+                              N->getOperand(0).getValueType().getSizeInBits());
+  SDValue Cast = DAG.getBitcast(IVT, N->getOperand(0));
+  return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT,
+                     DAG.getFreeze(Cast));
+}
+
 SDValue DAGTypeLegalizer::PromoteFloatRes_ConstantFP(SDNode *N) {
   ConstantFPSDNode *CFPNode = cast<ConstantFPSDNode>(N);
   EVT VT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 50247cebb91b1..720393158aa5e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -756,6 +756,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
 
   void PromoteFloatResult(SDNode *N, unsigned ResNo);
   SDValue PromoteFloatRes_BITCAST(SDNode *N);
+  SDValue PromoteFloatRes_FREEZE(SDNode *N);
   SDValue PromoteFloatRes_BinOp(SDNode *N);
   SDValue PromoteFloatRes_UnaryWithTwoFPResults(SDNode *N);
   SDValue PromoteFloatRes_ConstantFP(SDNode *N);

>From 3d3d5fbecc1b4ffe9857ba6844035fd7ce3b1c77 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 19 Mar 2025 08:11:15 -0400
Subject: [PATCH 13/19] Adjust expand-fp pass DEBUG_TYPE

---
 llvm/lib/CodeGen/ExpandFp.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index bc827172b3be9..b69e384074814 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -36,7 +36,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
-#define DEBUG_TYPE "expand-large-fp-convert"
+#define DEBUG_TYPE "expand-fp"
 
 using namespace llvm;
 

>From 22ba910c274599404a55265a75b42ecf7cab1afa Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 19 Mar 2025 11:37:35 -0400
Subject: [PATCH 14/19] Use AssumptionCache

... only if optimizations are enabled. Disable Value Tracking analysis
if optimizations are disabled.
---
 llvm/include/llvm/CodeGen/Passes.h    |     4 +-
 llvm/lib/CodeGen/ExpandFp.cpp         |    60 +-
 llvm/lib/CodeGen/TargetPassConfig.cpp |     2 +-
 llvm/test/CodeGen/AMDGPU/frem.ll      | 17176 ++++++++++++++++++++----
 llvm/test/CodeGen/X86/opt-pipeline.ll |     2 +-
 5 files changed, 14375 insertions(+), 2869 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index e5cb028b25dd9..06f04c01ba25b 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -528,8 +528,8 @@ namespace llvm {
   // Expands large div/rem instructions.
   FunctionPass *createExpandLargeDivRemPass();
 
-  // Expands large div/rem instructions.
-  FunctionPass *createExpandFpPass();
+  // Expands floating point instructions.
+  FunctionPass *createExpandFpPass(CodeGenOptLevel);
 
   // This pass expands memcmp() to load/stores.
   FunctionPass *createExpandMemCmpLegacyPass();
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index b69e384074814..fb8e333ab7a56 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -16,6 +16,7 @@
 
 #include "llvm/CodeGen/ExpandFp.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/SimplifyQuery.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -35,6 +36,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <optional>
 
 #define DEBUG_TYPE "expand-fp"
 
@@ -95,7 +97,7 @@ class FRemExpander {
   /// must match the type for which the class instance has been
   /// created. The code will be generated at the insertion point of \p
   /// B and the insertion point will be reset at exit.
-  Value *buildFRem(Value *X, Value *Y, SimplifyQuery &SQ) const;
+  Value *buildFRem(Value *X, Value *Y, std::optional<SimplifyQuery> &SQ) const;
 
 private:
   FRemExpander(IRBuilder<> &B, Type *FremTy, short Bits, unsigned long Signbit,
@@ -252,7 +254,8 @@ class FRemExpander {
   /// Return a value that is NaN if one of the corner cases concerning
   /// the inputs \p X and \p Y is detected, and \p Ret otherwise.
   Value *handleInputCornerCases(Value *Ret, Value *X,
-                                Value *Y, SimplifyQuery &SQ) const {
+                                Value *Y, std::optional<SimplifyQuery> &SQ,
+                                bool NoInfs) const {
     // Build:
     //   ret = y == 0.0f ? QNAN_ComputeFpTy : ret;
     //   bool c = !BUILTIN_ISNAN_ComputeFpTy(y) &&
@@ -262,9 +265,10 @@ class FRemExpander {
     Ret = B.CreateSelect(B.CreateFCmpOEQ(Y, ConstantFP::get(FremTy, 0.0)), Nan,
                          Ret);
     FPClassTest NotNan = FPClassTest::fcInf | FPClassTest::fcFinite;
-    Value *YNotNan =
-        isKnownNeverNaN(Y, 0, SQ) ? B.getTrue() : B.createIsFPClass(Y, NotNan);
-    Value *XFinite = isKnownNeverInfinity(X, 0, SQ)
+    Value *YNotNan = SQ && isKnownNeverNaN(Y, 0, *SQ)
+                         ? B.getTrue()
+                         : B.createIsFPClass(Y, NotNan);
+    Value *XFinite = NoInfs || (SQ && isKnownNeverInfinity(X, 0, *SQ))
                          ? B.getTrue()
                          : B.createIsFPClass(X, FPClassTest::fcFinite);
     Value *C = B.CreateLogicalAnd(YNotNan, XFinite);
@@ -274,7 +278,8 @@ class FRemExpander {
   }
 };
 
-  Value *FRemExpander::buildFRem(Value *X, Value *Y, SimplifyQuery &SQ) const {
+Value *FRemExpander::buildFRem(Value *X, Value *Y,
+                               std::optional<SimplifyQuery> &SQ) const {
   assert(X->getType() == FremTy && Y->getType() == FremTy);
 
   FastMathFlags FMF = B.getFastMathFlags();
@@ -300,7 +305,7 @@ class FRemExpander {
   // We would return NaN in all corner cases handled here.
   // Hence, if NaNs are excluded, keep the result as it is.
   if (!FMF.noNaNs())
-    Ret = handleInputCornerCases(Ret, X, Y, SQ);
+    Ret = handleInputCornerCases(Ret, X, Y, SQ, FMF.noInfs());
 
   Function *Fun = B.GetInsertBlock()->getParent();
   auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun);
@@ -358,7 +363,7 @@ static bool shouldSkipExpandFRem(BinaryOperator &I) {
          isConstOrConstSelectOp(I.getOperand(1));
 }
 
-static bool expandFRem(BinaryOperator &I, SimplifyQuery &SQ) {
+static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
   LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
   if (shouldSkipExpandFRem(I)) {
     LLVM_DEBUG(
@@ -999,7 +1004,8 @@ static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) {
   return TLI.getLibcallName(fremToLibcall(Ty->getScalarType()));
 }
 
-static bool runImpl(Function &F, const TargetLowering &TLI) {
+static bool runImpl(Function &F, const TargetLowering &TLI,
+                    AssumptionCache *AC) {
   SmallVector<Instruction *, 4> Replace;
   SmallVector<Instruction *, 4> ReplaceVector;
   bool Modified = false;
@@ -1071,7 +1077,16 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
   while (!Replace.empty()) {
     Instruction *I = Replace.pop_back_val();
     if (I->getOpcode() == Instruction::FRem) {
-      auto SQ = SimplifyQuery{I->getModule()->getDataLayout(), I};
+      auto SQ = [&]() -> std::optional<SimplifyQuery> {
+        if (AC) {
+          auto Res = std::make_optional<SimplifyQuery>(
+              I->getModule()->getDataLayout(), I);
+          Res->AC = AC;
+          return Res;
+        }
+        return {};
+      }();
+
       expandFRem(cast<BinaryOperator>(*I), SQ);
     } else if (I->getOpcode() == Instruction::FPToUI ||
                I->getOpcode() == Instruction::FPToSI) {
@@ -1086,21 +1101,30 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
 
 namespace {
 class ExpandFpLegacyPass : public FunctionPass {
+  CodeGenOptLevel OptLevel; // None => no AssumptionCache is requested.
 public:
   static char ID;
 
-  ExpandFpLegacyPass() : FunctionPass(ID) {
+  ExpandFpLegacyPass(CodeGenOptLevel OptLevel)
+      : FunctionPass(ID), OptLevel(OptLevel) {
     initializeExpandFpLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
+  ExpandFpLegacyPass() : ExpandFpLegacyPass(CodeGenOptLevel::None) {}
+
   bool runOnFunction(Function &F) override {
     auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
     auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
-    return runImpl(F, *TLI);
+    AssumptionCache *AC = nullptr;
+    if (OptLevel != CodeGenOptLevel::None)
+      AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    return runImpl(F, *TLI, AC);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetPassConfig>();
+    if (OptLevel != CodeGenOptLevel::None)
+      AU.addRequired<AssumptionCacheTracker>();
     AU.addPreserved<AAResultsWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
   }
@@ -1109,8 +1133,12 @@ class ExpandFpLegacyPass : public FunctionPass {
 
 PreservedAnalyses ExpandFpPass::run(Function &F, FunctionAnalysisManager &FAM) {
   const TargetSubtargetInfo *STI = TM->getSubtargetImpl(F);
-  return runImpl(F, *STI->getTargetLowering()) ? PreservedAnalyses::none()
-                                               : PreservedAnalyses::all();
+  auto &TLI = *STI->getTargetLowering();
+  AssumptionCache *AC = nullptr;
+  if (TM->getOptLevel() != CodeGenOptLevel::None)
+    AC = &FAM.getResult<AssumptionAnalysis>(F);
+  return runImpl(F, TLI, AC) ? PreservedAnalyses::none()
+                             : PreservedAnalyses::all();
 }
 
 char ExpandFpLegacyPass::ID = 0;
@@ -1118,4 +1146,6 @@ INITIALIZE_PASS_BEGIN(ExpandFpLegacyPass, "expand-fp",
                       "Expand certain fp instructions", false, false)
 INITIALIZE_PASS_END(ExpandFpLegacyPass, "expand-fp", "Expand fp", false, false)
 
-FunctionPass *llvm::createExpandFpPass() { return new ExpandFpLegacyPass(); }
+FunctionPass *llvm::createExpandFpPass(CodeGenOptLevel OptLevel) {
+  return new ExpandFpLegacyPass(OptLevel);
+}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index f788ec5ecb15b..3ff13265a63ea 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1070,7 +1070,7 @@ bool TargetPassConfig::addISelPasses() {
   PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
   addPass(createPreISelIntrinsicLoweringPass());
   addPass(createExpandLargeDivRemPass());
-  addPass(createExpandFpPass());
+  addPass(createExpandFpPass(getOptLevel()));
   addIRPasses();
   addCodeGenPrepare();
   addPassesToHandleExceptions();
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 5febd5256e794..07e4d5b18406c 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -10,82 +10,241 @@
 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s8, s0
-; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s0, s2
-; SI-NEXT:    s_mov_b32 s1, s3
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
+; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
-; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
-; SI-NEXT:    v_rcp_f32_e32 v4, v3
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1|
+; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; SI-NEXT:    s_cbranch_vccz .LBB0_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    v_bfi_b32 v5, s0, 0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB0_3
+; SI-NEXT:    s_branch .LBB0_8
+; SI-NEXT:  .LBB0_2:
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB0_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s3
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v4, v3
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v4
+; SI-NEXT:    s_cselect_b32 s2, s0, 0
+; SI-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v3, v3, 11
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, s3
+; SI-NEXT:    v_frexp_mant_f32_e32 v4, v2
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v2
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v2, v4, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v2, 1.0
+; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v6, v5
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
-; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
-; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
-; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
+; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
+; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
+; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
+; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
+; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
+; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
-; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
+; SI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB0_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 11
+; SI-NEXT:  .LBB0_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_mul_f32_e32 v3, v5, v4
+; SI-NEXT:    v_rndne_f32_e32 v3, v3
+; SI-NEXT:    v_fma_f32 v3, -v3, v2, v5
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; SI-NEXT:    v_add_f32_e32 v6, v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v3, v3, 11
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    s_cmp_gt_i32 s1, 11
+; SI-NEXT:    s_cbranch_scc1 .LBB0_5
+; SI-NEXT:  ; %bb.6: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v3, v5
+; SI-NEXT:  .LBB0_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s1, s1, -10
+; SI-NEXT:    v_ldexp_f32_e64 v3, v3, s1
+; SI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; SI-NEXT:    v_rndne_f32_e32 v4, v4
+; SI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; SI-NEXT:    v_add_f32_e32 v2, v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, s0
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_and_b32_e32 v3, 0x8000, v3
+; SI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; SI-NEXT:  .LBB0_8: ; %Flow19
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; SI-NEXT:    s_movk_i32 s0, 0x7c01
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; SI-NEXT:    s_movk_i32 s2, 0x7c00
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s2, v0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: frem_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s11, 0xf000
-; CI-NEXT:    s_mov_b32 s10, -1
-; CI-NEXT:    s_mov_b32 s6, s10
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s8, s0
-; CI-NEXT:    s_mov_b32 s9, s1
-; CI-NEXT:    s_mov_b32 s0, s2
-; CI-NEXT:    s_mov_b32 s1, s3
-; CI-NEXT:    s_mov_b32 s2, s10
-; CI-NEXT:    s_mov_b32 s3, s11
-; CI-NEXT:    s_mov_b32 s7, s11
-; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; CI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
+; CI-NEXT:    s_brev_b32 s0, -2
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
-; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
-; CI-NEXT:    v_rcp_f32_e32 v4, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
+; CI-NEXT:    v_and_b32_e32 v4, 0x7fffffff, v0
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1|
+; CI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
+; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT:    s_cbranch_vccz .LBB0_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    v_bfi_b32 v5, s0, 0, v0
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v2
+; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CI-NEXT:    s_cbranch_execz .LBB0_3
+; CI-NEXT:    s_branch .LBB0_8
+; CI-NEXT:  .LBB0_2:
+; CI-NEXT:    ; implicit-def: $vgpr3
+; CI-NEXT:  .LBB0_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e32 v3, v4
+; CI-NEXT:    v_ldexp_f32_e64 v5, v3, 11
+; CI-NEXT:    v_frexp_mant_f32_e32 v3, v2
+; CI-NEXT:    v_ldexp_f32_e64 v3, v3, 1
+; CI-NEXT:    v_div_scale_f32 v9, s[0:1], v3, v3, 1.0
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v2
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; CI-NEXT:    v_not_b32_e32 v4, v2
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; CI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v10, v9
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
-; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
-; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
-; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
+; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
+; CI-NEXT:    v_mul_f32_e32 v11, v6, v10
+; CI-NEXT:    v_fma_f32 v12, -v9, v11, v6
+; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
+; CI-NEXT:    v_fma_f32 v6, -v9, v11, v6
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; CI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v4
+; CI-NEXT:    v_div_fixup_f32 v6, v6, v3, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB0_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v4, vcc, v7, v8
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 11, v4
+; CI-NEXT:  .LBB0_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mul_f32_e32 v5, v7, v6
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v5, -v5, v3, v7
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT:    v_add_f32_e32 v8, v5, v3
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -11, v4
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v4
+; CI-NEXT:    v_ldexp_f32_e64 v5, v5, 11
+; CI-NEXT:    s_cbranch_vccnz .LBB0_5
+; CI-NEXT:  ; %bb.6: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v5, v7
+; CI-NEXT:  .LBB0_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -10, v4
+; CI-NEXT:    v_ldexp_f32_e32 v4, v5, v4
+; CI-NEXT:    v_mul_f32_e32 v5, v4, v6
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v3, v4, v3
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v2, v3, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_and_b32_e32 v3, 0x8000, v3
+; CI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; CI-NEXT:  .LBB0_8: ; %Flow19
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    s_movk_i32 s0, 0x7c01
+; CI-NEXT:    s_movk_i32 s2, 0x7c00
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s0, v1
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v3
+; CI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s2, v0
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
@@ -95,60 +254,191 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_u32 s0, s4, 8
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    flat_load_ushort v4, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_load_ushort v2, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s4, 8
+; VI-NEXT:    s_addc_u32 s3, s5, 0
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    flat_load_ushort v1, v[1:2]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; VI-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; VI-NEXT:    v_rcp_f32_e32 v6, v5
-; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
-; VI-NEXT:    v_mad_f32 v8, -v5, v7, v3
-; VI-NEXT:    v_mac_f32_e32 v7, v8, v6
-; VI-NEXT:    v_mad_f32 v3, -v5, v7, v3
-; VI-NEXT:    v_mul_f32_e32 v3, v3, v6
-; VI-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
-; VI-NEXT:    v_add_f32_e32 v3, v3, v7
-; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; VI-NEXT:    v_div_fixup_f16 v3, v3, v2, v4
-; VI-NEXT:    v_trunc_f16_e32 v3, v3
-; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
-; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |v1|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
+; VI-NEXT:    s_cbranch_vccz .LBB0_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_movk_i32 s2, 0x7fff
+; VI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; VI-NEXT:    s_cbranch_execz .LBB0_3
+; VI-NEXT:    s_branch .LBB0_8
+; VI-NEXT:  .LBB0_2:
+; VI-NEXT:    ; implicit-def: $vgpr2
+; VI-NEXT:  .LBB0_3: ; %frem.compute
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; VI-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; VI-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
+; VI-NEXT:    v_ldexp_f32 v3, v4, 1
+; VI-NEXT:    v_div_scale_f32 v9, s[2:3], v3, v3, 1.0
+; VI-NEXT:    v_ldexp_f32 v5, v2, 11
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT:    v_not_b32_e32 v4, v2
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v7
+; VI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v10, v9
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
+; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
+; VI-NEXT:    v_fma_f32 v12, -v9, v11, v6
+; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
+; VI-NEXT:    v_fma_f32 v6, -v9, v11, v6
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v4
+; VI-NEXT:    v_div_fixup_f32 v6, v6, v3, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB0_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, v7, v8
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 11, v4
+; VI-NEXT:  .LBB0_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mul_f32_e32 v5, v7, v6
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v5, -v5, v3, v7
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT:    v_add_f32_e32 v8, v5, v3
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -11, v4
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v4
+; VI-NEXT:    v_ldexp_f32 v5, v5, 11
+; VI-NEXT:    s_cbranch_vccnz .LBB0_5
+; VI-NEXT:  ; %bb.6: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v5, v7
+; VI-NEXT:  .LBB0_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -10, v4
+; VI-NEXT:    v_ldexp_f32 v4, v5, v4
+; VI-NEXT:    v_mul_f32_e32 v5, v4, v6
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT:    v_ldexp_f32 v2, v3, v2
+; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; VI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; VI-NEXT:  .LBB0_8: ; %Flow19
+; VI-NEXT:    v_mov_b32_e32 v5, 0x3fc
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v1
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], v1, v5
+; VI-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], v0, v1
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    flat_store_short v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: frem_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
+; GFX9-NEXT:    global_load_ushort v0, v2, s[10:11]
+; GFX9-NEXT:    global_load_ushort v1, v2, s[0:1] offset:8
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mac_f32_e32 v3, v5, v4
-; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v3, |v1|
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
+; GFX9-NEXT:    s_cbranch_vccz .LBB0_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v2, s0, 0, v0
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB0_3
+; GFX9-NEXT:    s_branch .LBB0_8
+; GFX9-NEXT:  .LBB0_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  .LBB0_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
+; GFX9-NEXT:    v_ldexp_f32 v3, v4, 1
+; GFX9-NEXT:    v_div_scale_f32 v9, s[0:1], v3, v3, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; GFX9-NEXT:    v_ldexp_f32 v5, v2, 11
+; GFX9-NEXT:    v_add_u32_e32 v2, -1, v8
+; GFX9-NEXT:    v_not_b32_e32 v4, v2
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX9-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX9-NEXT:    v_fma_f32 v10, v11, v10, v10
+; GFX9-NEXT:    v_mul_f32_e32 v11, v6, v10
+; GFX9-NEXT:    v_fma_f32 v12, -v9, v11, v6
+; GFX9-NEXT:    v_fma_f32 v11, v12, v10, v11
+; GFX9-NEXT:    v_fma_f32 v6, -v9, v11, v6
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v4
+; GFX9-NEXT:    v_div_fixup_f32 v6, v6, v3, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB0_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v4, v7, v8
+; GFX9-NEXT:    v_add_u32_e32 v4, 11, v4
+; GFX9-NEXT:  .LBB0_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v6
+; GFX9-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX9-NEXT:    v_fma_f32 v5, -v5, v3, v7
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_add_f32_e32 v8, v5, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, -11, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v4
+; GFX9-NEXT:    v_ldexp_f32 v5, v5, 11
+; GFX9-NEXT:    s_cbranch_vccnz .LBB0_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v5, v7
+; GFX9-NEXT:  .LBB0_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v4, -10, v4
+; GFX9-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v6
+; GFX9-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX9-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX9-NEXT:  .LBB0_8: ; %Flow19
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3fc
+; GFX9-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[0:1], v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[2:3], v0, v1
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    global_store_short v3, v0, s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: frem_f16:
@@ -156,28 +446,93 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
+; GFX10-NEXT:    global_load_ushort v0, v2, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v2, s[6:7] offset:8
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
-; GFX10-NEXT:    v_mad_f32 v7, -v4, v6, v3
-; GFX10-NEXT:    v_mac_f32_e32 v6, v7, v5
-; GFX10-NEXT:    v_mad_f32 v3, -v4, v6, v3
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v6
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v2, |v1|
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    s_cbranch_vccz .LBB0_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v3, 0x7fff, 0, v0
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB0_3
+; GFX10-NEXT:    s_branch .LBB0_8
+; GFX10-NEXT:  .LBB0_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr3
+; GFX10-NEXT:  .LBB0_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v4
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v5, v4
+; GFX10-NEXT:    v_ldexp_f32 v4, v3, 11
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v2
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v5
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX10-NEXT:    v_div_scale_f32 v7, s4, v3, v3, 1.0
+; GFX10-NEXT:    v_not_b32_e32 v6, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX10-NEXT:    s_denorm_mode 15
+; GFX10-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_mul_f32_e32 v9, v5, v8
+; GFX10-NEXT:    v_fma_f32 v10, -v7, v9, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX10-NEXT:    v_fma_f32 v5, -v7, v9, v5
+; GFX10-NEXT:    s_denorm_mode 12
+; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v6
+; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB0_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 11
+; GFX10-NEXT:  .LBB0_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v4
+; GFX10-NEXT:    s_add_i32 s2, s2, -11
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX10-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX10-NEXT:    v_fma_f32 v4, -v4, v3, v7
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v4, v4, 11
+; GFX10-NEXT:    s_cbranch_scc1 .LBB0_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v6, s2
+; GFX10-NEXT:    v_mov_b32_e32 v4, v7
+; GFX10-NEXT:  .LBB0_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, -10, v6
+; GFX10-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX10-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX10-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v2
+; GFX10-NEXT:  .LBB0_8: ; %Flow19
+; GFX10-NEXT:    v_cmp_class_f16_e64 s2, v1, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f16_e64 s3, v0, 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX10-NEXT:    global_store_short v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: frem_f16:
@@ -185,35 +540,115 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX11-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-NEXT:    global_load_u16 v1, v1, s[4:5] offset:8
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v2, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v2
+; GFX11-NEXT:    s_cbranch_vccz .LBB0_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v3, 0x7fff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB0_3
+; GFX11-NEXT:    s_branch .LBB0_8
+; GFX11-NEXT:  .LBB0_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr3
+; GFX11-NEXT:  .LBB0_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v4
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v5, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v4, v3, 11
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v2
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX11-NEXT:    v_not_b32_e32 v6, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v4
-; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v8, v9, v8
+; GFX11-NEXT:    v_mul_f32_e32 v9, v5, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-NEXT:    v_fma_f32 v10, -v7, v9, v5
+; GFX11-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v5, -v7, v9, v5
+; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB0_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 11
+; GFX11-NEXT:  .LBB0_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v7, v4
+; GFX11-NEXT:    s_add_i32 s2, s2, -11
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX11-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v4, -v4, v3, v7
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v4, v4, 11
+; GFX11-NEXT:    s_cbranch_scc1 .LBB0_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v6, s2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v7
+; GFX11-NEXT:  .LBB0_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, -10, v6
+; GFX11-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX11-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
 ; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_xor_b32_e32 v3, v3, v2
+; GFX11-NEXT:  .LBB0_8: ; %Flow19
+; GFX11-NEXT:    v_cmp_class_f16_e64 s2, v1, 0x3fc
+; GFX11-NEXT:    v_cmp_class_f16_e64 s3, v0, 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f16:
@@ -221,36 +656,123 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX1150-NEXT:    global_load_u16 v1, v1, s[4:5] offset:8
 ; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, 0x7fff
+; GFX1150-NEXT:    s_and_b32 s4, s3, 0x7fff
+; GFX1150-NEXT:    s_cvt_f32_f16 s3, s2
+; GFX1150-NEXT:    s_cvt_f32_f16 s2, s4
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX1150-NEXT:    s_cmp_ngt_f32 s3, s2
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB0_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    s_cmp_eq_f32 s3, s2
+; GFX1150-NEXT:    v_bfi_b32 v2, 0x7fff, 0, v0
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB0_3
+; GFX1150-NEXT:    s_branch .LBB0_8
+; GFX1150-NEXT:  .LBB0_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-NEXT:  .LBB0_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v3, s2
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v2, s3
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v5, s3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX1150-NEXT:    v_ldexp_f32 v4, v2, 11
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v2, s2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX1150-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX1150-NEXT:    v_rcp_f32_e32 v8, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX1150-NEXT:    v_not_b32_e32 v6, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v8
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-NEXT:    v_mul_f32_e32 v9, v5, v8
+; GFX1150-NEXT:    v_fma_f32 v10, -v7, v9, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX1150-NEXT:    v_fma_f32 v5, -v7, v9, v5
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v6
+; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB0_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s3, s2
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 11
+; GFX1150-NEXT:  .LBB0_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1150-NEXT:    s_add_i32 s2, s2, -11
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 11
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fma_f32 v4, v4, v3, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, 11
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB0_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v6, s2
+; GFX1150-NEXT:    v_mov_b32_e32 v4, v7
+; GFX1150-NEXT:  .LBB0_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, -10, v6
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v4, v5, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX1150-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX1150-NEXT:  .LBB0_8: ; %Flow19
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s2, v1, 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s3, v0, 0x1f8
+; GFX1150-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s3
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX1150-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, 0x7e00, v2
+; GFX1150-NEXT:    global_store_b16 v3, v0, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -269,26 +791,107 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s8, s0
-; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s0, s2
-; SI-NEXT:    s_mov_b32 s1, s3
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_mov_b32 s8, s2
+; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
+; SI-NEXT:    s_brev_b32 s2, -2
+; SI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_rcp_f32_e32 v2, v1
-; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; SI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; SI-NEXT:    v_cmp_le_f32_e64 s[4:5], |v0|, v1
+; SI-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v0
+; SI-NEXT:    s_cbranch_vccz .LBB1_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    v_bfi_b32 v4, s2, 0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB1_3
+; SI-NEXT:    s_branch .LBB1_8
+; SI-NEXT:  .LBB1_2:
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB1_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s5, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, s5
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v3, v2
+; SI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v3
+; SI-NEXT:    s_cselect_b32 s4, s2, 0
+; SI-NEXT:    v_frexp_mant_f32_e32 v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, 11
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s5
+; SI-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; SI-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; SI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v1
+; SI-NEXT:    s_cselect_b32 s5, s2, 0
+; SI-NEXT:    s_add_i32 s2, s5, -1
+; SI-NEXT:    v_ldexp_f32_e64 v1, v3, 1
+; SI-NEXT:    s_not_b32 s3, s2
+; SI-NEXT:    s_add_i32 s3, s3, s4
+; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v1, 1.0
+; SI-NEXT:    v_div_scale_f32 v4, s[6:7], v1, v1, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v5, v4
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
+; SI-NEXT:    v_mul_f32_e32 v6, v3, v5
+; SI-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
+; SI-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; SI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s3, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB1_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s3, s4, s5
+; SI-NEXT:    s_add_i32 s3, s3, 11
+; SI-NEXT:  .LBB1_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mul_f32_e32 v2, v4, v3
+; SI-NEXT:    v_rndne_f32_e32 v2, v2
+; SI-NEXT:    v_fma_f32 v2, -v2, v1, v4
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; SI-NEXT:    v_add_f32_e32 v5, v2, v1
+; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, 11
+; SI-NEXT:    s_add_i32 s3, s3, -11
+; SI-NEXT:    s_cmp_gt_i32 s3, 11
+; SI-NEXT:    s_cbranch_scc1 .LBB1_5
+; SI-NEXT:  ; %bb.6: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:  .LBB1_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s3, s3, -10
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, s3
+; SI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; SI-NEXT:    v_rndne_f32_e32 v3, v3
+; SI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; SI-NEXT:    v_add_f32_e32 v1, v2, v1
+; SI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v1, v1, s2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; SI-NEXT:    v_xor_b32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; SI-NEXT:  .LBB1_8: ; %Flow19
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: fast_frem_f16:
@@ -298,26 +901,94 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI-NEXT:    s_mov_b32 s11, 0xf000
 ; CI-NEXT:    s_mov_b32 s10, -1
 ; CI-NEXT:    s_mov_b32 s6, s10
-; CI-NEXT:    s_mov_b32 s7, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s8, s2
+; CI-NEXT:    s_mov_b32 s9, s3
+; CI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; CI-NEXT:    s_mov_b32 s7, s11
 ; CI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
-; CI-NEXT:    s_mov_b32 s8, s0
-; CI-NEXT:    s_mov_b32 s9, s1
-; CI-NEXT:    s_mov_b32 s0, s2
-; CI-NEXT:    s_mov_b32 s1, s3
-; CI-NEXT:    s_mov_b32 s2, s10
-; CI-NEXT:    s_mov_b32 s3, s11
-; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; CI-NEXT:    s_brev_b32 s2, -2
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_rcp_f32_e32 v2, v1
-; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
+; CI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v0
+; CI-NEXT:    v_cmp_le_f32_e64 s[4:5], |v0|, v1
+; CI-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; CI-NEXT:    s_cbranch_vccz .LBB1_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    v_bfi_b32 v4, s2, 0, v0
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT:    s_cbranch_execz .LBB1_3
+; CI-NEXT:    s_branch .LBB1_8
+; CI-NEXT:  .LBB1_2:
+; CI-NEXT:    ; implicit-def: $vgpr3
+; CI-NEXT:  .LBB1_3: ; %frem.compute
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v2
+; CI-NEXT:    v_frexp_mant_f32_e32 v2, v2
+; CI-NEXT:    v_ldexp_f32_e64 v4, v2, 11
+; CI-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; CI-NEXT:    v_ldexp_f32_e64 v2, v2, 1
+; CI-NEXT:    v_div_scale_f32 v8, s[2:3], v2, v2, 1.0
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v1
+; CI-NEXT:    v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT:    v_not_b32_e32 v3, v1
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v9, v8
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; CI-NEXT:    v_mul_f32_e32 v10, v5, v9
+; CI-NEXT:    v_fma_f32 v11, -v8, v10, v5
+; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; CI-NEXT:    v_fma_f32 v5, -v8, v10, v5
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v3
+; CI-NEXT:    v_div_fixup_f32 v5, v5, v2, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB1_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v6, v7
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 11, v3
+; CI-NEXT:  .LBB1_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v6, v5
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v2, v6
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v7, v4, v2
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v3
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT:    s_cbranch_vccnz .LBB1_5
+; CI-NEXT:  ; %bb.6: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v4, v6
+; CI-NEXT:  .LBB1_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -10, v3
+; CI-NEXT:    v_ldexp_f32_e32 v3, v4, v3
+; CI-NEXT:    v_mul_f32_e32 v4, v3, v5
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT:    v_add_f32_e32 v2, v3, v2
+; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; CI-NEXT:    v_xor_b32_e32 v0, v0, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; CI-NEXT:  .LBB1_8: ; %Flow19
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fast_frem_f16:
@@ -325,38 +996,173 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s4, 8
+; VI-NEXT:    s_addc_u32 s3, s5, 0
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    flat_load_ushort v1, v[1:2]
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_cvt_f32_f16_e64 v2, |v0|
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; VI-NEXT:    v_cmp_le_f32_e32 vcc, v2, v1
+; VI-NEXT:    s_cbranch_vccz .LBB1_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_movk_i32 s2, 0x7fff
+; VI-NEXT:    v_bfi_b32 v3, s2, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; VI-NEXT:    s_cbranch_execz .LBB1_3
+; VI-NEXT:    s_branch .LBB1_8
+; VI-NEXT:  .LBB1_2:
+; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:  .LBB1_3: ; %frem.compute
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v2
+; VI-NEXT:    v_frexp_mant_f32_e32 v2, v2
+; VI-NEXT:    v_ldexp_f32 v4, v2, 11
+; VI-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; VI-NEXT:    v_ldexp_f32 v2, v2, 1
+; VI-NEXT:    v_div_scale_f32 v8, s[2:3], v2, v2, 1.0
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT:    v_not_b32_e32 v3, v1
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
+; VI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v9, v8
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; VI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; VI-NEXT:    v_mul_f32_e32 v10, v5, v9
+; VI-NEXT:    v_fma_f32 v11, -v8, v10, v5
+; VI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; VI-NEXT:    v_fma_f32 v5, -v8, v10, v5
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v3
+; VI-NEXT:    v_div_fixup_f32 v5, v5, v2, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB1_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v6, v7
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 11, v3
+; VI-NEXT:  .LBB1_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v6, v5
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v2, v6
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v7, v4, v2
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v3
+; VI-NEXT:    v_ldexp_f32 v4, v4, 11
+; VI-NEXT:    s_cbranch_vccnz .LBB1_5
+; VI-NEXT:  ; %bb.6: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v4, v6
+; VI-NEXT:  .LBB1_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -10, v3
+; VI-NEXT:    v_ldexp_f32 v3, v4, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v3, v5
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT:    v_ldexp_f32 v1, v2, v1
+; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    v_xor_b32_e32 v3, v0, v1
+; VI-NEXT:  .LBB1_8: ; %Flow19
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_u32 s0, s4, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    flat_load_ushort v4, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_load_ushort v2, v[2:3]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_rcp_f16_e32 v3, v2
-; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
-; VI-NEXT:    v_trunc_f16_e32 v3, v3
-; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
-; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    flat_store_short v[0:1], v3
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: fast_frem_f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
+; GFX9-NEXT:    global_load_ushort v0, v1, s[2:3]
+; GFX9-NEXT:    global_load_ushort v3, v1, s[6:7] offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v2, |v0|
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, |v3|
+; GFX9-NEXT:    v_cmp_le_f32_e32 vcc, v2, v1
+; GFX9-NEXT:    s_cbranch_vccz .LBB1_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v3, s2, 0, v0
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v2, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB1_3
+; GFX9-NEXT:    s_branch .LBB1_8
+; GFX9-NEXT:  .LBB1_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr3
+; GFX9-NEXT:  .LBB1_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v6, v2
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v2, v2
+; GFX9-NEXT:    v_ldexp_f32 v4, v2, 11
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GFX9-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX9-NEXT:    v_div_scale_f32 v8, s[2:3], v2, v2, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v7, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v7
+; GFX9-NEXT:    v_not_b32_e32 v3, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
+; GFX9-NEXT:    v_rcp_f32_e32 v9, v8
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; GFX9-NEXT:    v_fma_f32 v9, v10, v9, v9
+; GFX9-NEXT:    v_mul_f32_e32 v10, v5, v9
+; GFX9-NEXT:    v_fma_f32 v11, -v8, v10, v5
+; GFX9-NEXT:    v_fma_f32 v10, v11, v9, v10
+; GFX9-NEXT:    v_fma_f32 v5, -v8, v10, v5
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v3
+; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v2, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB1_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v3, v6, v7
+; GFX9-NEXT:    v_add_u32_e32 v3, 11, v3
+; GFX9-NEXT:  .LBB1_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v6, v5
+; GFX9-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX9-NEXT:    v_fma_f32 v4, -v4, v2, v6
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_add_f32_e32 v7, v4, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, -11, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v3
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, 11
+; GFX9-NEXT:    s_cbranch_vccnz .LBB1_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v4, v6
+; GFX9-NEXT:  .LBB1_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v3, -10, v3
+; GFX9-NEXT:    v_ldexp_f32 v3, v4, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v5
+; GFX9-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX9-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v3, v0, v1
+; GFX9-NEXT:  .LBB1_8: ; %Flow19
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_short v0, v3, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: fast_frem_f16:
@@ -364,17 +1170,87 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
+; GFX10-NEXT:    global_load_ushort v0, v1, s[2:3]
+; GFX10-NEXT:    global_load_ushort v3, v1, s[6:7] offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v2, |v0|
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v1, |v3|
+; GFX10-NEXT:    v_cmp_le_f32_e32 vcc_lo, v2, v1
+; GFX10-NEXT:    s_cbranch_vccz .LBB1_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v3, 0x7fff, 0, v0
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v2, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB1_3
+; GFX10-NEXT:    s_branch .LBB1_8
+; GFX10-NEXT:  .LBB1_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr3
+; GFX10-NEXT:  .LBB1_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v4, v2
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX10-NEXT:    v_ldexp_f32 v3, v2, 11
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX10-NEXT:    v_div_scale_f32 v6, s4, v2, v2, 1.0
+; GFX10-NEXT:    v_not_b32_e32 v5, v1
+; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v4
+; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX10-NEXT:    s_denorm_mode 15
+; GFX10-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX10-NEXT:    v_mul_f32_e32 v8, v4, v7
+; GFX10-NEXT:    v_fma_f32 v9, -v6, v8, v4
+; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX10-NEXT:    v_fma_f32 v4, -v6, v8, v4
+; GFX10-NEXT:    s_denorm_mode 12
+; GFX10-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v5
+; GFX10-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB1_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 11
+; GFX10-NEXT:  .LBB1_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    s_add_i32 s2, s2, -11
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX10-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX10-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX10-NEXT:    v_fma_f32 v3, -v3, v2, v6
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_add_f32_e32 v5, v3, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, 11
+; GFX10-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v5, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, v6
+; GFX10-NEXT:  .LBB1_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, -10, v5
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, v5
+; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX10-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX10-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT:    v_xor_b32_e32 v3, v0, v1
+; GFX10-NEXT:  .LBB1_8: ; %Flow19
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_store_short v0, v3, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fast_frem_f16:
@@ -382,19 +1258,108 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX11-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-NEXT:    global_load_u16 v1, v1, s[4:5] offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v2, |v0|
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_le_f32_e32 vcc_lo, v2, v1
+; GFX11-NEXT:    s_cbranch_vccz .LBB1_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v3, 0x7fff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB1_3
+; GFX11-NEXT:    s_branch .LBB1_8
+; GFX11-NEXT:  .LBB1_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr3
+; GFX11-NEXT:  .LBB1_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v4, v2
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX11-NEXT:    v_ldexp_f32 v3, v2, 11
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v6, null, v2, v2, 1.0
+; GFX11-NEXT:    v_not_b32_e32 v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, v5, v4
+; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX11-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX11-NEXT:    v_mul_f32_e32 v8, v4, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v9, -v6, v8, v4
+; GFX11-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v4, -v6, v8, v4
+; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB1_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 11
+; GFX11-NEXT:  .LBB1_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    s_add_i32 s2, s2, -11
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX11-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v3, -v3, v2, v6
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX11-NEXT:    v_add_f32_e32 v5, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, 11
+; GFX11-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v5, s2
+; GFX11-NEXT:    v_mov_b32_e32 v3, v6
+; GFX11-NEXT:  .LBB1_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, -10, v5
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX11-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    v_xor_b32_e32 v3, v0, v1
+; GFX11-NEXT:  .LBB1_8: ; %Flow19
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b16 v0, v3, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f16:
@@ -402,19 +1367,116 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX1150-NEXT:    global_load_u16 v1, v1, s[4:5] offset:8
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, 0x7fff
+; GFX1150-NEXT:    s_and_b32 s4, s3, 0x7fff
+; GFX1150-NEXT:    s_cvt_f32_f16 s3, s2
+; GFX1150-NEXT:    s_cvt_f32_f16 s2, s4
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX1150-NEXT:    s_cmp_le_f32 s3, s2
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB1_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    s_cmp_eq_f32 s3, s2
+; GFX1150-NEXT:    v_bfi_b32 v1, 0x7fff, 0, v0
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB1_3
+; GFX1150-NEXT:    s_branch .LBB1_8
+; GFX1150-NEXT:  .LBB1_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr1
+; GFX1150-NEXT:  .LBB1_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v2, s2
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v1, s3
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v4, s3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX1150-NEXT:    v_ldexp_f32 v3, v1, 11
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v1, s2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1150-NEXT:    v_div_scale_f32 v6, null, v2, v2, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1150-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v5, v1
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, v5, v4
+; GFX1150-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v8, v4, v7
+; GFX1150-NEXT:    v_fma_f32 v9, -v6, v8, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX1150-NEXT:    v_fma_f32 v4, -v6, v8, v4
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v5
+; GFX1150-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB1_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s3, s2
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 11
+; GFX1150-NEXT:  .LBB1_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v3
+; GFX1150-NEXT:    s_add_i32 s2, s2, -11
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fma_f32 v3, v3, v2, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v5, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 11
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v5, s2
+; GFX1150-NEXT:    v_mov_b32_e32 v3, v6
+; GFX1150-NEXT:  .LBB1_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, -10, v5
+; GFX1150-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fmac_f32_e32 v3, v4, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v1, v0, v1
+; GFX1150-NEXT:  .LBB1_8: ; %Flow19
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
@@ -429,58 +1491,213 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: unsafe_frem_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s8, s0
-; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s0, s2
-; SI-NEXT:    s_mov_b32 s1, s3
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_rcp_f32_e32 v2, v1
-; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v4
+; SI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v4|, |v1|
+; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
+; SI-NEXT:    s_cbranch_vccz .LBB2_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    v_bfi_b32 v4, s0, 0, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB2_3
+; SI-NEXT:    s_branch .LBB2_8
+; SI-NEXT:  .LBB2_2:
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB2_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s3
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v4, v3
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v4
+; SI-NEXT:    s_cselect_b32 s2, s0, 0
+; SI-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v4, v3, 11
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, s3
+; SI-NEXT:    v_frexp_mant_f32_e32 v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v2
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v2, v3, 1
+; SI-NEXT:    v_rcp_f32_e32 v3, v2
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    s_cmp_lt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB2_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 11
+; SI-NEXT:  .LBB2_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v5, v4
+; SI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; SI-NEXT:    v_rndne_f32_e32 v4, v4
+; SI-NEXT:    v_fma_f32 v4, -v4, v2, v5
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; SI-NEXT:    v_add_f32_e32 v6, v4, v2
+; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v4, v4, 11
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    s_cmp_gt_i32 s1, 11
+; SI-NEXT:    s_cbranch_scc1 .LBB2_5
+; SI-NEXT:  ; %bb.6: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v4, v5
+; SI-NEXT:  .LBB2_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s1, s1, -10
+; SI-NEXT:    v_ldexp_f32_e64 v4, v4, s1
+; SI-NEXT:    v_mul_f32_e32 v3, v4, v3
+; SI-NEXT:    v_rndne_f32_e32 v3, v3
+; SI-NEXT:    v_fma_f32 v3, -v3, v2, v4
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; SI-NEXT:    v_add_f32_e32 v2, v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, s0
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; SI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; SI-NEXT:  .LBB2_8: ; %Flow19
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; SI-NEXT:    s_movk_i32 s0, 0x7c01
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; SI-NEXT:    s_movk_i32 s2, 0x7c00
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s2, v0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: unsafe_frem_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s11, 0xf000
-; CI-NEXT:    s_mov_b32 s10, -1
-; CI-NEXT:    s_mov_b32 s6, s10
-; CI-NEXT:    s_mov_b32 s7, s11
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:8
-; CI-NEXT:    s_mov_b32 s8, s0
-; CI-NEXT:    s_mov_b32 s9, s1
-; CI-NEXT:    s_mov_b32 s0, s2
-; CI-NEXT:    s_mov_b32 s1, s3
-; CI-NEXT:    s_mov_b32 s2, s10
-; CI-NEXT:    s_mov_b32 s3, s11
-; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_rcp_f32_e32 v2, v1
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; CI-NEXT:    s_mov_b32 s3, s7
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:8
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_and_b32_e32 v4, 0x7fffffff, v3
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v3
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v3|, |v1|
+; CI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
+; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT:    s_cbranch_vccz .LBB2_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; CI-NEXT:    v_bfi_b32 v3, s0, 0, v3
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v2
+; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; CI-NEXT:    s_cbranch_execz .LBB2_3
+; CI-NEXT:    s_branch .LBB2_8
+; CI-NEXT:  .LBB2_2:
+; CI-NEXT:    ; implicit-def: $vgpr3
+; CI-NEXT:  .LBB2_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e32 v3, v4
+; CI-NEXT:    v_ldexp_f32_e64 v6, v3, 11
+; CI-NEXT:    v_frexp_mant_f32_e32 v3, v2
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v2
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT:    v_ldexp_f32_e64 v3, v3, 1
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; CI-NEXT:    v_rcp_f32_e32 v4, v3
+; CI-NEXT:    v_not_b32_e32 v5, v2
+; CI-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v5
+; CI-NEXT:    s_cbranch_vccnz .LBB2_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v5, vcc, v7, v8
+; CI-NEXT:    v_add_i32_e32 v5, vcc, 11, v5
+; CI-NEXT:  .LBB2_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v6
+; CI-NEXT:    v_mul_f32_e32 v6, v7, v4
+; CI-NEXT:    v_rndne_f32_e32 v6, v6
+; CI-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT:    v_add_f32_e32 v8, v6, v3
+; CI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -11, v5
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v5
+; CI-NEXT:    v_ldexp_f32_e64 v6, v6, 11
+; CI-NEXT:    s_cbranch_vccnz .LBB2_5
+; CI-NEXT:  ; %bb.6: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v6, v7
+; CI-NEXT:  .LBB2_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -10, v5
+; CI-NEXT:    v_ldexp_f32_e32 v5, v6, v5
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v4
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v3, v4, v3
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v2, v3, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; CI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; CI-NEXT:  .LBB2_8: ; %Flow19
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    s_movk_i32 s0, 0x7c01
+; CI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; CI-NEXT:    s_movk_i32 s2, 0x7c00
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s0, v1
+; CI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s2, v0
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v3
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
@@ -490,38 +1707,167 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_u32 s0, s4, 8
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    flat_load_ushort v4, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_load_ushort v2, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s4, 8
+; VI-NEXT:    s_addc_u32 s3, s5, 0
+; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    flat_load_ushort v1, v[1:2]
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_rcp_f16_e32 v3, v2
-; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
-; VI-NEXT:    v_trunc_f16_e32 v3, v3
-; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
-; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |v1|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
+; VI-NEXT:    s_cbranch_vccz .LBB2_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_movk_i32 s2, 0x7fff
+; VI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; VI-NEXT:    s_cbranch_execz .LBB2_3
+; VI-NEXT:    s_branch .LBB2_8
+; VI-NEXT:  .LBB2_2:
+; VI-NEXT:    ; implicit-def: $vgpr2
+; VI-NEXT:  .LBB2_3: ; %frem.compute
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; VI-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; VI-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
+; VI-NEXT:    v_ldexp_f32 v6, v2, 11
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT:    v_ldexp_f32 v3, v4, 1
+; VI-NEXT:    v_rcp_f32_e32 v4, v3
+; VI-NEXT:    v_not_b32_e32 v5, v2
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v5
+; VI-NEXT:    s_cbranch_vccnz .LBB2_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, v7, v8
+; VI-NEXT:    v_add_u32_e32 v5, vcc, 11, v5
+; VI-NEXT:  .LBB2_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v6
+; VI-NEXT:    v_mul_f32_e32 v6, v7, v4
+; VI-NEXT:    v_rndne_f32_e32 v6, v6
+; VI-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT:    v_add_f32_e32 v8, v6, v3
+; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -11, v5
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v5
+; VI-NEXT:    v_ldexp_f32 v6, v6, 11
+; VI-NEXT:    s_cbranch_vccnz .LBB2_5
+; VI-NEXT:  ; %bb.6: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v6, v7
+; VI-NEXT:  .LBB2_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -10, v5
+; VI-NEXT:    v_ldexp_f32 v5, v6, v5
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v4
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT:    v_ldexp_f32 v2, v3, v2
+; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; VI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; VI-NEXT:  .LBB2_8: ; %Flow19
+; VI-NEXT:    v_mov_b32_e32 v5, 0x3fc
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v1
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], v1, v5
+; VI-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], v0, v1
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    flat_store_short v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: unsafe_frem_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
+; GFX9-NEXT:    global_load_ushort v0, v2, s[10:11]
+; GFX9-NEXT:    global_load_ushort v1, v2, s[0:1] offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v3, |v1|
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
+; GFX9-NEXT:    s_cbranch_vccz .LBB2_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v2, s0, 0, v0
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB2_3
+; GFX9-NEXT:    s_branch .LBB2_8
+; GFX9-NEXT:  .LBB2_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  .LBB2_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
+; GFX9-NEXT:    v_ldexp_f32 v6, v2, 11
+; GFX9-NEXT:    v_add_u32_e32 v2, -1, v8
+; GFX9-NEXT:    v_ldexp_f32 v3, v4, 1
+; GFX9-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX9-NEXT:    v_not_b32_e32 v5, v2
+; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v5
+; GFX9-NEXT:    s_cbranch_vccnz .LBB2_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v5, v7, v8
+; GFX9-NEXT:    v_add_u32_e32 v5, 11, v5
+; GFX9-NEXT:  .LBB2_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v6
+; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v4
+; GFX9-NEXT:    v_rndne_f32_e32 v6, v6
+; GFX9-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_add_f32_e32 v8, v6, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, -11, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v5
+; GFX9-NEXT:    v_ldexp_f32 v6, v6, 11
+; GFX9-NEXT:    s_cbranch_vccnz .LBB2_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-NEXT:  .LBB2_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v5, -10, v5
+; GFX9-NEXT:    v_ldexp_f32 v5, v6, v5
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX9-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX9-NEXT:  .LBB2_8: ; %Flow19
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3fc
+; GFX9-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[0:1], v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[2:3], v0, v1
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7e00
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    global_store_short v3, v0, s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: unsafe_frem_f16:
@@ -529,17 +1875,81 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
-; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] offset:8
+; GFX10-NEXT:    global_load_ushort v0, v2, s[2:3]
+; GFX10-NEXT:    global_load_ushort v1, v2, s[6:7] offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v2, |v1|
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    s_cbranch_vccz .LBB2_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v3, 0x7fff, 0, v0
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB2_3
+; GFX10-NEXT:    s_branch .LBB2_8
+; GFX10-NEXT:  .LBB2_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr3
+; GFX10-NEXT:  .LBB2_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v4
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v6, v4
+; GFX10-NEXT:    v_ldexp_f32 v5, v3, 11
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v2
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX10-NEXT:    v_not_b32_e32 v7, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v6
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v7
+; GFX10-NEXT:    s_cbranch_vccnz .LBB2_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 11
+; GFX10-NEXT:  .LBB2_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v5
+; GFX10-NEXT:    s_add_i32 s2, s2, -11
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX10-NEXT:    v_mul_f32_e32 v5, v6, v4
+; GFX10-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX10-NEXT:    v_fma_f32 v5, -v5, v3, v6
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_add_f32_e32 v7, v5, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v5, v5, 11
+; GFX10-NEXT:    s_cbranch_scc1 .LBB2_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v7, s2
+; GFX10-NEXT:    v_mov_b32_e32 v5, v6
+; GFX10-NEXT:  .LBB2_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, -10, v7
+; GFX10-NEXT:    v_ldexp_f32 v5, v5, v6
+; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX10-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v2
+; GFX10-NEXT:  .LBB2_8: ; %Flow19
+; GFX10-NEXT:    v_cmp_class_f16_e64 s2, v1, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f16_e64 s3, v0, 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX10-NEXT:    global_store_short v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: unsafe_frem_f16:
@@ -547,19 +1957,100 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX11-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX11-NEXT:    global_load_u16 v1, v1, s[4:5] offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v2, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v2
+; GFX11-NEXT:    s_cbranch_vccz .LBB2_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v3, 0x7fff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB2_3
+; GFX11-NEXT:    s_branch .LBB2_8
+; GFX11-NEXT:  .LBB2_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr3
+; GFX11-NEXT:  .LBB2_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v4
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v5, v3, 11
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v2
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX11-NEXT:    v_not_b32_e32 v7, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, v7, v6
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v7
+; GFX11-NEXT:    s_cbranch_vccnz .LBB2_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 11
+; GFX11-NEXT:  .LBB2_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v5
+; GFX11-NEXT:    s_add_i32 s2, s2, -11
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v5, v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX11-NEXT:    v_fma_f32 v5, -v5, v3, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX11-NEXT:    v_add_f32_e32 v7, v5, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v5, v5, 11
+; GFX11-NEXT:    s_cbranch_scc1 .LBB2_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v7, s2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v6
+; GFX11-NEXT:  .LBB2_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, -10, v7
+; GFX11-NEXT:    v_ldexp_f32 v5, v5, v6
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX11-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v3, v3, v2
+; GFX11-NEXT:  .LBB2_8: ; %Flow19
+; GFX11-NEXT:    v_cmp_class_f16_e64 s2, v1, 0x3fc
+; GFX11-NEXT:    v_cmp_class_f16_e64 s3, v0, 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f16:
@@ -567,20 +2058,107 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX1150-NEXT:    global_load_u16 v1, v1, s[4:5] offset:8
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, 0x7fff
+; GFX1150-NEXT:    s_and_b32 s4, s3, 0x7fff
+; GFX1150-NEXT:    s_cvt_f32_f16 s3, s2
+; GFX1150-NEXT:    s_cvt_f32_f16 s2, s4
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX1150-NEXT:    s_cmp_ngt_f32 s3, s2
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB2_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    s_cmp_eq_f32 s3, s2
+; GFX1150-NEXT:    v_bfi_b32 v2, 0x7fff, 0, v0
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB2_3
+; GFX1150-NEXT:    s_branch .LBB2_8
+; GFX1150-NEXT:  .LBB2_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-NEXT:  .LBB2_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v2, s3
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v3, s2
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v6, s3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_ldexp_f32 v5, v2, 11
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v2, s2
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v6
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX1150-NEXT:    v_not_b32_e32 v7, v2
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1150-NEXT:    v_add_nc_u32_e32 v7, v7, v6
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v7
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB2_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s3, s2
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 11
+; GFX1150-NEXT:  .LBB2_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v5
+; GFX1150-NEXT:    s_add_i32 s2, s2, -11
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v6, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fma_f32 v5, v5, v3, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v7, v5, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, 11
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB2_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v7, s2
+; GFX1150-NEXT:    v_mov_b32_e32 v5, v6
+; GFX1150-NEXT:  .LBB2_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, -10, v7
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, v6
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v5, v4, v3
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1150-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX1150-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX1150-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX1150-NEXT:  .LBB2_8: ; %Flow19
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s2, v1, 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s3, v0, 0x1f8
+; GFX1150-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s3
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX1150-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, 0x7e00, v2
+; GFX1150-NEXT:    global_store_b16 v3, v0, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -594,73 +2172,206 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s8, s0
-; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s0, s2
-; SI-NEXT:    s_mov_b32 s1, s3
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
-; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
-; SI-NEXT:    v_rcp_f32_e32 v4, v3
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB3_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v2, s0, 0, v0
+; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB3_3
+; SI-NEXT:    s_branch .LBB3_8
+; SI-NEXT:  .LBB3_2:
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB3_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s4
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; SI-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v2
+; SI-NEXT:    s_cselect_b32 s2, s2, 0
+; SI-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; SI-NEXT:    v_cndmask_b32_e64 v2, |v0|, v2, s[0:1]
+; SI-NEXT:    v_ldexp_f32_e64 v3, v2, 12
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, s4
+; SI-NEXT:    v_frexp_mant_f32_e64 v2, |v1|
+; SI-NEXT:    v_cndmask_b32_e64 v2, |v1|, v2, s[0:1]
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v4, v1
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v4
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v2, 1.0
+; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v6, v5
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
-; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
-; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
-; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
+; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
+; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
+; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
+; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
+; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
+; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
-; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
+; SI-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 13
+; SI-NEXT:    s_cbranch_scc1 .LBB3_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 12
+; SI-NEXT:  .LBB3_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_mul_f32_e32 v3, v5, v4
+; SI-NEXT:    v_rndne_f32_e32 v3, v3
+; SI-NEXT:    v_fma_f32 v3, -v3, v2, v5
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; SI-NEXT:    v_add_f32_e32 v6, v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v3, v3, 12
+; SI-NEXT:    s_add_i32 s1, s1, -12
+; SI-NEXT:    s_cmp_gt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB3_5
+; SI-NEXT:  ; %bb.6: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v3, v5
+; SI-NEXT:  .LBB3_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    v_ldexp_f32_e64 v3, v3, s1
+; SI-NEXT:    v_mul_f32_e32 v4, v3, v4
+; SI-NEXT:    v_rndne_f32_e32 v4, v4
+; SI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; SI-NEXT:    v_add_f32_e32 v2, v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, s0
+; SI-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; SI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; SI-NEXT:  .LBB3_8: ; %Flow17
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v1
+; SI-NEXT:    v_mov_b32_e32 v3, 0x3fc
+; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v1, v3
+; SI-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; SI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v1
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: frem_f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s11, 0xf000
-; CI-NEXT:    s_mov_b32 s10, -1
-; CI-NEXT:    s_mov_b32 s6, s10
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s8, s0
-; CI-NEXT:    s_mov_b32 s9, s1
-; CI-NEXT:    s_mov_b32 s0, s2
-; CI-NEXT:    s_mov_b32 s1, s3
-; CI-NEXT:    s_mov_b32 s2, s10
-; CI-NEXT:    s_mov_b32 s3, s11
-; CI-NEXT:    s_mov_b32 s7, s11
-; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; CI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
-; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
-; CI-NEXT:    v_rcp_f32_e32 v4, v3
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB3_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v2, s0, 0, v0
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; CI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; CI-NEXT:    s_cbranch_execz .LBB3_3
+; CI-NEXT:    s_branch .LBB3_8
+; CI-NEXT:  .LBB3_2:
+; CI-NEXT:    ; implicit-def: $vgpr2
+; CI-NEXT:  .LBB3_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; CI-NEXT:    v_ldexp_f32_e64 v3, v3, 1
+; CI-NEXT:    v_div_scale_f32 v9, s[0:1], v3, v3, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v1
+; CI-NEXT:    v_ldexp_f32_e64 v5, v2, 12
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; CI-NEXT:    v_not_b32_e32 v4, v2
+; CI-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; CI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v10, v9
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
-; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
-; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
-; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
-; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
-; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
+; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
+; CI-NEXT:    v_mul_f32_e32 v11, v6, v10
+; CI-NEXT:    v_fma_f32 v12, -v9, v11, v6
+; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
+; CI-NEXT:    v_fma_f32 v6, -v9, v11, v6
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v4
+; CI-NEXT:    v_div_fixup_f32 v6, v6, v3, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB3_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v4, vcc, v7, v8
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 12, v4
+; CI-NEXT:  .LBB3_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v5
+; CI-NEXT:    v_mul_f32_e32 v5, v7, v6
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v5, -v5, v3, v7
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT:    v_add_f32_e32 v8, v5, v3
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -12, v4
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v4
+; CI-NEXT:    v_ldexp_f32_e64 v5, v5, 12
+; CI-NEXT:    s_cbranch_vccnz .LBB3_5
+; CI-NEXT:  ; %bb.6: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v5, v7
+; CI-NEXT:  .LBB3_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -11, v4
+; CI-NEXT:    v_ldexp_f32_e32 v4, v5, v4
+; CI-NEXT:    v_mul_f32_e32 v5, v4, v6
+; CI-NEXT:    v_rndne_f32_e32 v5, v5
+; CI-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v3, v4, v3
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v2, v3, v2
+; CI-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; CI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; CI-NEXT:  .LBB3_8: ; %Flow17
+; CI-NEXT:    v_mov_b32_e32 v3, 0x3fc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v1
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], v1, v3
+; CI-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v1
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -669,60 +2380,185 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_u32 s0, s4, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    flat_load_dword v4, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_load_dword v2, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s4, 16
+; VI-NEXT:    s_addc_u32 s3, s5, 0
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    flat_load_dword v1, v[1:2]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v4
-; VI-NEXT:    v_div_scale_f32 v3, vcc, v4, v2, v4
-; VI-NEXT:    v_rcp_f32_e32 v6, v5
+; VI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1|
+; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT:    s_cbranch_vccz .LBB3_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_brev_b32 s2, -2
+; VI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; VI-NEXT:    s_cbranch_execz .LBB3_3
+; VI-NEXT:    s_branch .LBB3_8
+; VI-NEXT:  .LBB3_2:
+; VI-NEXT:    ; implicit-def: $vgpr2
+; VI-NEXT:  .LBB3_3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; VI-NEXT:    v_ldexp_f32 v3, v3, 1
+; VI-NEXT:    v_div_scale_f32 v9, s[2:3], v3, v3, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v1
+; VI-NEXT:    v_ldexp_f32 v5, v2, 12
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; VI-NEXT:    v_not_b32_e32 v4, v2
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v7
+; VI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v10, v9
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; VI-NEXT:    v_fma_f32 v6, v7, v6, v6
-; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
-; VI-NEXT:    v_fma_f32 v8, -v5, v7, v3
-; VI-NEXT:    v_fma_f32 v7, v8, v6, v7
-; VI-NEXT:    v_fma_f32 v3, -v5, v7, v3
+; VI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
+; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
+; VI-NEXT:    v_fma_f32 v12, -v9, v11, v6
+; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
+; VI-NEXT:    v_fma_f32 v6, -v9, v11, v6
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
-; VI-NEXT:    v_div_fixup_f32 v3, v3, v2, v4
-; VI-NEXT:    v_trunc_f32_e32 v3, v3
-; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v4
+; VI-NEXT:    v_div_fixup_f32 v6, v6, v3, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB3_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, v7, v8
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 12, v4
+; VI-NEXT:  .LBB3_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mul_f32_e32 v5, v7, v6
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v5, -v5, v3, v7
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT:    v_add_f32_e32 v8, v5, v3
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -12, v4
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v4
+; VI-NEXT:    v_ldexp_f32 v5, v5, 12
+; VI-NEXT:    s_cbranch_vccnz .LBB3_5
+; VI-NEXT:  ; %bb.6: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v5, v7
+; VI-NEXT:  .LBB3_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -11, v4
+; VI-NEXT:    v_ldexp_f32 v4, v5, v4
+; VI-NEXT:    v_mul_f32_e32 v5, v4, v6
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT:    v_ldexp_f32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; VI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; VI-NEXT:  .LBB3_8: ; %Flow17
+; VI-NEXT:    v_mov_b32_e32 v5, 0x3fc
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v1
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], v1, v5
+; VI-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v1
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    flat_store_dword v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: frem_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
+; GFX9-NEXT:    global_load_dword v0, v2, s[10:11]
+; GFX9-NEXT:    global_load_dword v1, v2, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_div_scale_f32 v4, s[2:3], v2, v2, v1
-; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v1, v2, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB3_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_brev_b32 s0, -2
+; GFX9-NEXT:    v_bfi_b32 v2, s0, 0, v0
+; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB3_3
+; GFX9-NEXT:    s_branch .LBB3_8
+; GFX9-NEXT:  .LBB3_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  .LBB3_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; GFX9-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX9-NEXT:    v_div_scale_f32 v9, s[0:1], v3, v3, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; GFX9-NEXT:    v_ldexp_f32 v5, v2, 12
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v8, v1
+; GFX9-NEXT:    v_add_u32_e32 v2, -1, v8
+; GFX9-NEXT:    v_not_b32_e32 v4, v2
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX9-NEXT:    v_rcp_f32_e32 v10, v9
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX9-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
-; GFX9-NEXT:    v_fma_f32 v5, v6, v5, v5
-; GFX9-NEXT:    v_mul_f32_e32 v6, v3, v5
-; GFX9-NEXT:    v_fma_f32 v7, -v4, v6, v3
-; GFX9-NEXT:    v_fma_f32 v6, v7, v5, v6
-; GFX9-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX9-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX9-NEXT:    v_fma_f32 v10, v11, v10, v10
+; GFX9-NEXT:    v_mul_f32_e32 v11, v6, v10
+; GFX9-NEXT:    v_fma_f32 v12, -v9, v11, v6
+; GFX9-NEXT:    v_fma_f32 v11, v12, v10, v11
+; GFX9-NEXT:    v_fma_f32 v6, -v9, v11, v6
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
-; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v4
+; GFX9-NEXT:    v_div_fixup_f32 v6, v6, v3, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB3_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v4, v7, v8
+; GFX9-NEXT:    v_add_u32_e32 v4, 12, v4
+; GFX9-NEXT:  .LBB3_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v6
+; GFX9-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX9-NEXT:    v_fma_f32 v5, -v5, v3, v7
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_add_f32_e32 v8, v5, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, -12, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v4
+; GFX9-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX9-NEXT:    s_cbranch_vccnz .LBB3_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v5, v7
+; GFX9-NEXT:  .LBB3_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v4, -11, v4
+; GFX9-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v6
+; GFX9-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX9-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX9-NEXT:  .LBB3_8: ; %Flow17
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3fc
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[0:1], v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v1
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    global_store_dword v3, v0, s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: frem_f32:
@@ -730,28 +2566,90 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
+; GFX10-NEXT:    global_load_dword v0, v2, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v2, s[6:7] offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v4, s2, v2, v2, v1
-; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s2, |v0|, |v1|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB3_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, v0
+; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1|
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB3_3
+; GFX10-NEXT:    s_branch .LBB3_8
+; GFX10-NEXT:  .LBB3_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  .LBB3_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v5, v0
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX10-NEXT:    v_ldexp_f32 v4, v2, 12
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v5
+; GFX10-NEXT:    v_div_scale_f32 v7, s4, v3, v3, 1.0
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX10-NEXT:    v_not_b32_e32 v6, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
 ; GFX10-NEXT:    s_denorm_mode 15
-; GFX10-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v5, v6, v5
-; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
-; GFX10-NEXT:    v_fma_f32 v7, -v4, v6, v3
-; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v5
-; GFX10-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX10-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_mul_f32_e32 v9, v5, v8
+; GFX10-NEXT:    v_fma_f32 v10, -v7, v9, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX10-NEXT:    v_fma_f32 v5, -v7, v9, v5
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
-; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v6
+; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB3_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 12
+; GFX10-NEXT:  .LBB3_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v4
+; GFX10-NEXT:    s_add_i32 s2, s2, -12
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX10-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX10-NEXT:    v_fma_f32 v4, -v4, v3, v7
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v4, v4, 12
+; GFX10-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v6, s2
+; GFX10-NEXT:    v_mov_b32_e32 v4, v7
+; GFX10-NEXT:  .LBB3_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, -11, v6
+; GFX10-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX10-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX10-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX10-NEXT:  .LBB3_8: ; %Flow17
+; GFX10-NEXT:    v_cmp_class_f32_e64 s2, v1, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: frem_f32:
@@ -759,34 +2657,111 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX11-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX11-NEXT:    global_load_b32 v1, v1, s[4:5] offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
-; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s2, |v0|, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB3_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB3_3
+; GFX11-NEXT:    s_branch .LBB3_8
+; GFX11-NEXT:  .LBB3_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  .LBB3_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX11-NEXT:    v_ldexp_f32 v4, v2, 12
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v5
+; GFX11-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX11-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v6, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
-; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v5
+; GFX11-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v6, v3, v5
-; GFX11-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX11-NEXT:    v_fmac_f32_e32 v8, v9, v8
+; GFX11-NEXT:    v_mul_f32_e32 v9, v5, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v6, v7, v5
-; GFX11-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX11-NEXT:    v_fma_f32 v10, -v7, v9, v5
+; GFX11-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v5, -v7, v9, v5
 ; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB3_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 12
+; GFX11-NEXT:  .LBB3_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v7, v4
+; GFX11-NEXT:    s_add_i32 s2, s2, -12
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
-; GFX11-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
+; GFX11-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX11-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_fma_f32 v4, -v4, v3, v7
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v4, v4, 12
+; GFX11-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v6, s2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v7
+; GFX11-NEXT:  .LBB3_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, -11, v6
+; GFX11-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX11-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX11-NEXT:  .LBB3_8: ; %Flow17
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, v1, 0x3fc
+; GFX11-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, 0x7fc00000, v2
+; GFX11-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f32:
@@ -794,35 +2769,115 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX1150-NEXT:    global_load_b32 v1, v1, s[4:5] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v0
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
-; GFX1150-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX1150-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v2, v3
+; GFX1150-NEXT:    s_cbranch_vccz .LBB3_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v0
+; GFX1150-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v2, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB3_3
+; GFX1150-NEXT:    s_branch .LBB3_8
+; GFX1150-NEXT:  .LBB3_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-NEXT:  .LBB3_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v5, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX1150-NEXT:    v_ldexp_f32 v4, v2, 12
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v5
+; GFX1150-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX1150-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v6, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
 ; GFX1150-NEXT:    s_denorm_mode 15
-; GFX1150-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v8
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v5, v6, v5
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX1150-NEXT:    v_mul_f32_e32 v9, v5, v8
+; GFX1150-NEXT:    v_fma_f32 v10, -v7, v9, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v7, -v4, v6, v3
-; GFX1150-NEXT:    v_fmac_f32_e32 v6, v7, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX1150-NEXT:    v_fma_f32 v5, -v7, v9, v5
 ; GFX1150-NEXT:    s_denorm_mode 12
-; GFX1150-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v6
+; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB3_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s2, s3
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 12
+; GFX1150-NEXT:  .LBB3_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1150-NEXT:    s_add_i32 s2, s2, -12
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 12
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
-; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fma_f32 v4, v4, v3, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, 12
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v6, s2
+; GFX1150-NEXT:    v_mov_b32_e32 v4, v7
+; GFX1150-NEXT:  .LBB3_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, -11, v6
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v4, v5, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX1150-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; GFX1150-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX1150-NEXT:  .LBB3_8: ; %Flow17
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s2, v1, 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX1150-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s3
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX1150-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, 0x7fc00000, v2
+; GFX1150-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -841,22 +2896,97 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s8, s0
-; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s0, s2
-; SI-NEXT:    s_mov_b32 s1, s3
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_mov_b32 s8, s2
+; SI-NEXT:    s_mov_b32 s9, s3
 ; SI-NEXT:    s_mov_b32 s6, s10
 ; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_rcp_f32_e32 v2, v1
-; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT:    v_cmp_le_f32_e64 s[2:3], |v0|, |v1|
+; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT:    s_cbranch_vccz .LBB4_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    s_brev_b32 s2, -2
+; SI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB4_3
+; SI-NEXT:    s_branch .LBB4_8
+; SI-NEXT:  .LBB4_2:
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB4_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s6, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v0|, s6
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; SI-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    s_cselect_b32 s4, s4, 0
+; SI-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; SI-NEXT:    v_cndmask_b32_e64 v2, |v0|, v2, s[2:3]
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, 12
+; SI-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, s6
+; SI-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; SI-NEXT:    v_cndmask_b32_e64 v3, |v1|, v3, s[2:3]
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; SI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v1
+; SI-NEXT:    s_cselect_b32 s5, s2, 0
+; SI-NEXT:    s_add_i32 s2, s5, -1
+; SI-NEXT:    v_ldexp_f32_e64 v1, v3, 1
+; SI-NEXT:    s_not_b32 s3, s2
+; SI-NEXT:    s_add_i32 s3, s3, s4
+; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v1, 1.0
+; SI-NEXT:    v_div_scale_f32 v4, s[6:7], v1, v1, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v5, v4
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
+; SI-NEXT:    v_mul_f32_e32 v6, v3, v5
+; SI-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
+; SI-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; SI-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s3, 13
+; SI-NEXT:    s_cbranch_scc1 .LBB4_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s3, s4, s5
+; SI-NEXT:    s_add_i32 s3, s3, 12
+; SI-NEXT:  .LBB4_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mul_f32_e32 v2, v4, v3
+; SI-NEXT:    v_rndne_f32_e32 v2, v2
+; SI-NEXT:    v_fma_f32 v2, -v2, v1, v4
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; SI-NEXT:    v_add_f32_e32 v5, v2, v1
+; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, 12
+; SI-NEXT:    s_add_i32 s3, s3, -12
+; SI-NEXT:    s_cmp_gt_i32 s3, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB4_5
+; SI-NEXT:  ; %bb.6: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:  .LBB4_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s3, s3, -11
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, s3
+; SI-NEXT:    v_mul_f32_e32 v3, v2, v3
+; SI-NEXT:    v_rndne_f32_e32 v3, v3
+; SI-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
+; SI-NEXT:    v_add_f32_e32 v1, v2, v1
+; SI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v1, v1, s2
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    v_xor_b32_e32 v2, v0, v1
+; SI-NEXT:  .LBB4_8: ; %Flow17
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: fast_frem_f32:
@@ -867,21 +2997,83 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI-NEXT:    s_mov_b32 s10, -1
 ; CI-NEXT:    s_mov_b32 s6, s10
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s8, s0
-; CI-NEXT:    s_mov_b32 s9, s1
-; CI-NEXT:    s_mov_b32 s0, s2
-; CI-NEXT:    s_mov_b32 s1, s3
-; CI-NEXT:    s_mov_b32 s2, s10
-; CI-NEXT:    s_mov_b32 s3, s11
+; CI-NEXT:    s_mov_b32 s8, s2
+; CI-NEXT:    s_mov_b32 s9, s3
 ; CI-NEXT:    s_mov_b32 s7, s11
-; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; CI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; CI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_rcp_f32_e32 v2, v1
-; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; CI-NEXT:    v_cmp_le_f32_e64 s[2:3], |v0|, |v1|
+; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT:    s_cbranch_vccz .LBB4_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_brev_b32 s2, -2
+; CI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; CI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; CI-NEXT:    s_cbranch_execz .LBB4_3
+; CI-NEXT:    s_branch .LBB4_8
+; CI-NEXT:  .LBB4_2:
+; CI-NEXT:    ; implicit-def: $vgpr2
+; CI-NEXT:  .LBB4_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; CI-NEXT:    v_ldexp_f32_e64 v4, v2, 12
+; CI-NEXT:    v_frexp_mant_f32_e64 v2, |v1|
+; CI-NEXT:    v_ldexp_f32_e64 v2, v2, 1
+; CI-NEXT:    v_div_scale_f32 v8, s[2:3], v2, v2, 1.0
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v1
+; CI-NEXT:    v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; CI-NEXT:    v_not_b32_e32 v3, v1
+; CI-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v9, v8
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; CI-NEXT:    v_mul_f32_e32 v10, v5, v9
+; CI-NEXT:    v_fma_f32 v11, -v8, v10, v5
+; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; CI-NEXT:    v_fma_f32 v5, -v8, v10, v5
+; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v3
+; CI-NEXT:    v_div_fixup_f32 v5, v5, v2, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB4_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v6, v7
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 12, v3
+; CI-NEXT:  .LBB4_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v6, v4
+; CI-NEXT:    v_mul_f32_e32 v4, v6, v5
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v2, v6
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v7, v4, v2
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -12, v3
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v3
+; CI-NEXT:    v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT:    s_cbranch_vccnz .LBB4_5
+; CI-NEXT:  ; %bb.6: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v4, v6
+; CI-NEXT:  .LBB4_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT:    v_ldexp_f32_e32 v3, v4, v3
+; CI-NEXT:    v_mul_f32_e32 v4, v3, v5
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT:    v_add_f32_e32 v2, v3, v2
+; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; CI-NEXT:    v_xor_b32_e32 v2, v0, v1
+; CI-NEXT:  .LBB4_8: ; %Flow17
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fast_frem_f32:
@@ -889,21 +3081,85 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s4, 16
+; VI-NEXT:    s_addc_u32 s3, s5, 0
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    flat_load_dword v1, v[1:2]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cmp_le_f32_e64 s[2:3], |v0|, |v1|
+; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT:    s_cbranch_vccz .LBB4_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_brev_b32 s2, -2
+; VI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; VI-NEXT:    s_cbranch_execz .LBB4_3
+; VI-NEXT:    s_branch .LBB4_8
+; VI-NEXT:  .LBB4_2:
+; VI-NEXT:    ; implicit-def: $vgpr2
+; VI-NEXT:  .LBB4_3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; VI-NEXT:    v_ldexp_f32 v4, v2, 12
+; VI-NEXT:    v_frexp_mant_f32_e64 v2, |v1|
+; VI-NEXT:    v_ldexp_f32 v2, v2, 1
+; VI-NEXT:    v_div_scale_f32 v8, s[2:3], v2, v2, 1.0
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; VI-NEXT:    v_not_b32_e32 v3, v1
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
+; VI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v9, v8
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; VI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; VI-NEXT:    v_mul_f32_e32 v10, v5, v9
+; VI-NEXT:    v_fma_f32 v11, -v8, v10, v5
+; VI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; VI-NEXT:    v_fma_f32 v5, -v8, v10, v5
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v3
+; VI-NEXT:    v_div_fixup_f32 v5, v5, v2, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB4_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v6, v7
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 12, v3
+; VI-NEXT:  .LBB4_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    v_mul_f32_e32 v4, v6, v5
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v2, v6
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v7, v4, v2
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -12, v3
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v3
+; VI-NEXT:    v_ldexp_f32 v4, v4, 12
+; VI-NEXT:    s_cbranch_vccnz .LBB4_5
+; VI-NEXT:  ; %bb.6: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v4, v6
+; VI-NEXT:  .LBB4_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT:    v_ldexp_f32 v3, v4, v3
+; VI-NEXT:    v_mul_f32_e32 v4, v3, v5
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT:    v_add_f32_e32 v2, v3, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT:    v_ldexp_f32 v1, v2, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; VI-NEXT:    v_xor_b32_e32 v2, v0, v1
+; VI-NEXT:  .LBB4_8: ; %Flow17
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_u32 s0, s4, 16
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    flat_load_dword v4, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_load_dword v2, v[2:3]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_rcp_f32_e32 v3, v2
-; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
-; VI-NEXT:    v_trunc_f32_e32 v3, v3
-; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -911,16 +3167,81 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
+; GFX9-NEXT:    global_load_dword v0, v2, s[2:3]
+; GFX9-NEXT:    global_load_dword v1, v2, s[6:7] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_f32_e64 s[2:3], |v0|, |v1|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_vccz .LBB4_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_brev_b32 s2, -2
+; GFX9-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB4_3
+; GFX9-NEXT:    s_branch .LBB4_8
+; GFX9-NEXT:  .LBB4_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  .LBB4_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX9-NEXT:    v_ldexp_f32 v4, v2, 12
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v2, |v1|
+; GFX9-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX9-NEXT:    v_div_scale_f32 v8, s[2:3], v2, v2, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v7, v1
+; GFX9-NEXT:    v_add_u32_e32 v1, -1, v7
+; GFX9-NEXT:    v_not_b32_e32 v3, v1
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
+; GFX9-NEXT:    v_rcp_f32_e32 v9, v8
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; GFX9-NEXT:    v_fma_f32 v9, v10, v9, v9
+; GFX9-NEXT:    v_mul_f32_e32 v10, v5, v9
+; GFX9-NEXT:    v_fma_f32 v11, -v8, v10, v5
+; GFX9-NEXT:    v_fma_f32 v10, v11, v9, v10
+; GFX9-NEXT:    v_fma_f32 v5, -v8, v10, v5
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v3
+; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v2, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB4_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v3, v6, v7
+; GFX9-NEXT:    v_add_u32_e32 v3, 12, v3
+; GFX9-NEXT:  .LBB4_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v6, v5
+; GFX9-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX9-NEXT:    v_fma_f32 v4, -v4, v2, v6
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_add_f32_e32 v7, v4, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, -12, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v3
+; GFX9-NEXT:    v_ldexp_f32 v4, v4, 12
+; GFX9-NEXT:    s_cbranch_vccnz .LBB4_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v4, v6
+; GFX9-NEXT:  .LBB4_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v3, -11, v3
+; GFX9-NEXT:    v_ldexp_f32 v3, v4, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v5
+; GFX9-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX9-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX9-NEXT:  .LBB4_8: ; %Flow17
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: fast_frem_f32:
@@ -928,17 +3249,84 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
+; GFX10-NEXT:    global_load_dword v0, v2, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v2, s[6:7] offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_cmp_le_f32_e64 s2, |v0|, |v1|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB4_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, v0
+; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1|
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB4_3
+; GFX10-NEXT:    s_branch .LBB4_8
+; GFX10-NEXT:  .LBB4_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  .LBB4_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v4, v0
+; GFX10-NEXT:    v_ldexp_f32 v3, v2, 12
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v2, |v1|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX10-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX10-NEXT:    v_div_scale_f32 v6, s4, v2, v2, 1.0
+; GFX10-NEXT:    v_not_b32_e32 v5, v1
+; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v4
+; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX10-NEXT:    s_denorm_mode 15
+; GFX10-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX10-NEXT:    v_mul_f32_e32 v8, v4, v7
+; GFX10-NEXT:    v_fma_f32 v9, -v6, v8, v4
+; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX10-NEXT:    v_fma_f32 v4, -v6, v8, v4
+; GFX10-NEXT:    s_denorm_mode 12
+; GFX10-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v5
+; GFX10-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB4_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 12
+; GFX10-NEXT:  .LBB4_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    s_add_i32 s2, s2, -12
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX10-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX10-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX10-NEXT:    v_fma_f32 v3, -v3, v2, v6
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_add_f32_e32 v5, v3, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, 12
+; GFX10-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v5, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, v6
+; GFX10-NEXT:  .LBB4_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, -11, v5
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, v5
+; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX10-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX10-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX10-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX10-NEXT:  .LBB4_8: ; %Flow17
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fast_frem_f32:
@@ -946,19 +3334,105 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX11-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX11-NEXT:    global_load_b32 v1, v1, s[4:5] offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX11-NEXT:    v_cmp_le_f32_e64 s2, |v0|, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB4_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB4_3
+; GFX11-NEXT:    s_branch .LBB4_8
+; GFX11-NEXT:  .LBB4_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  .LBB4_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v4, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v3, v2, 12
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v2, |v1|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v6, null, v2, v2, 1.0
+; GFX11-NEXT:    v_not_b32_e32 v5, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, v5, v4
+; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX11-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX11-NEXT:    v_mul_f32_e32 v8, v4, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v9, -v6, v8, v4
+; GFX11-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v4, -v6, v8, v4
+; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB4_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 12
+; GFX11-NEXT:  .LBB4_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    s_add_i32 s2, s2, -12
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX11-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v3, -v3, v2, v6
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX11-NEXT:    v_add_f32_e32 v5, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, 12
+; GFX11-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v5, s2
+; GFX11-NEXT:    v_mov_b32_e32 v3, v6
+; GFX11-NEXT:  .LBB4_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, -11, v5
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX11-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX11-NEXT:  .LBB4_8: ; %Flow17
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f32:
@@ -966,20 +3440,109 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX1150-NEXT:    global_load_b32 v1, v1, s[4:5] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v0
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1150-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_le_f32_e32 vcc_lo, v2, v3
+; GFX1150-NEXT:    s_cbranch_vccz .LBB4_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v0
+; GFX1150-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v2, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB4_3
+; GFX1150-NEXT:    s_branch .LBB4_8
+; GFX1150-NEXT:  .LBB4_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-NEXT:  .LBB4_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v4, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v2, 12
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v2, |v1|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1150-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_scale_f32 v6, null, v2, v2, 1.0
+; GFX1150-NEXT:    v_not_b32_e32 v5, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, v5, v4
+; GFX1150-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v8, v4, v7
+; GFX1150-NEXT:    v_fma_f32 v9, -v6, v8, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX1150-NEXT:    v_fma_f32 v4, -v6, v8, v4
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v5
+; GFX1150-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB4_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s2, s3
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 12
+; GFX1150-NEXT:  .LBB4_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v3
+; GFX1150-NEXT:    s_add_i32 s2, s2, -12
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v3, v3
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1150-NEXT:    v_fma_f32 v3, v3, v2, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v5, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 12
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB4_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v5, s2
+; GFX1150-NEXT:    v_mov_b32_e32 v3, v6
+; GFX1150-NEXT:  .LBB4_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, -11, v5
+; GFX1150-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fmac_f32_e32 v3, v4, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX1150-NEXT:    v_xor_b32_e32 v2, v0, v1
+; GFX1150-NEXT:  .LBB4_8: ; %Flow17
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -993,51 +3556,182 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: unsafe_frem_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s8, s0
-; SI-NEXT:    s_mov_b32 s9, s1
-; SI-NEXT:    s_mov_b32 s0, s2
-; SI-NEXT:    s_mov_b32 s1, s3
-; SI-NEXT:    s_mov_b32 s2, s10
-; SI-NEXT:    s_mov_b32 s3, s11
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
-; SI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_rcp_f32_e32 v2, v1
-; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB5_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v2, s0, 0, v0
+; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB5_3
+; SI-NEXT:    s_branch .LBB5_8
+; SI-NEXT:  .LBB5_2:
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB5_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s4
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; SI-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v2
+; SI-NEXT:    s_cselect_b32 s2, s2, 0
+; SI-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; SI-NEXT:    v_cndmask_b32_e64 v2, |v0|, v2, s[0:1]
+; SI-NEXT:    v_ldexp_f32_e64 v4, v2, 12
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, s4
+; SI-NEXT:    v_frexp_mant_f32_e64 v2, |v1|
+; SI-NEXT:    v_cndmask_b32_e64 v2, |v1|, v2, s[0:1]
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v3
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, 1
+; SI-NEXT:    v_rcp_f32_e32 v3, v2
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    s_cmp_lt_i32 s1, 13
+; SI-NEXT:    s_cbranch_scc1 .LBB5_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 12
+; SI-NEXT:  .LBB5_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v5, v4
+; SI-NEXT:    v_mul_f32_e32 v4, v5, v3
+; SI-NEXT:    v_rndne_f32_e32 v4, v4
+; SI-NEXT:    v_fma_f32 v4, -v4, v2, v5
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; SI-NEXT:    v_add_f32_e32 v6, v4, v2
+; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v4, v4, 12
+; SI-NEXT:    s_add_i32 s1, s1, -12
+; SI-NEXT:    s_cmp_gt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB5_5
+; SI-NEXT:  ; %bb.6: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v4, v5
+; SI-NEXT:  .LBB5_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    v_ldexp_f32_e64 v4, v4, s1
+; SI-NEXT:    v_mul_f32_e32 v3, v4, v3
+; SI-NEXT:    v_rndne_f32_e32 v3, v3
+; SI-NEXT:    v_fma_f32 v3, -v3, v2, v4
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
+; SI-NEXT:    v_add_f32_e32 v2, v3, v2
+; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v2, v2, s0
+; SI-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; SI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; SI-NEXT:  .LBB5_8: ; %Flow17
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v1
+; SI-NEXT:    v_mov_b32_e32 v3, 0x3fc
+; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v1, v3
+; SI-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; SI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v1
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: unsafe_frem_f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s11, 0xf000
-; CI-NEXT:    s_mov_b32 s10, -1
-; CI-NEXT:    s_mov_b32 s6, s10
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s8, s0
-; CI-NEXT:    s_mov_b32 s9, s1
-; CI-NEXT:    s_mov_b32 s0, s2
-; CI-NEXT:    s_mov_b32 s1, s3
-; CI-NEXT:    s_mov_b32 s2, s10
-; CI-NEXT:    s_mov_b32 s3, s11
-; CI-NEXT:    s_mov_b32 s7, s11
-; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; CI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:16
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_rcp_f32_e32 v2, v1
-; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB5_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v2, s0, 0, v0
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; CI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; CI-NEXT:    s_cbranch_execz .LBB5_3
+; CI-NEXT:    s_branch .LBB5_8
+; CI-NEXT:  .LBB5_2:
+; CI-NEXT:    ; implicit-def: $vgpr2
+; CI-NEXT:  .LBB5_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; CI-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v1
+; CI-NEXT:    v_ldexp_f32_e64 v6, v2, 12
+; CI-NEXT:    v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT:    v_ldexp_f32_e64 v3, v3, 1
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; CI-NEXT:    v_rcp_f32_e32 v4, v3
+; CI-NEXT:    v_not_b32_e32 v5, v2
+; CI-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v5
+; CI-NEXT:    s_cbranch_vccnz .LBB5_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v5, vcc, v7, v8
+; CI-NEXT:    v_add_i32_e32 v5, vcc, 12, v5
+; CI-NEXT:  .LBB5_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v7, v6
+; CI-NEXT:    v_mul_f32_e32 v6, v7, v4
+; CI-NEXT:    v_rndne_f32_e32 v6, v6
+; CI-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT:    v_add_f32_e32 v8, v6, v3
+; CI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -12, v5
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v5
+; CI-NEXT:    v_ldexp_f32_e64 v6, v6, 12
+; CI-NEXT:    s_cbranch_vccnz .LBB5_5
+; CI-NEXT:  ; %bb.6: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v6, v7
+; CI-NEXT:  .LBB5_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -11, v5
+; CI-NEXT:    v_ldexp_f32_e32 v5, v6, v5
+; CI-NEXT:    v_mul_f32_e32 v4, v5, v4
+; CI-NEXT:    v_rndne_f32_e32 v4, v4
+; CI-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_add_f32_e32 v3, v4, v3
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v2, v3, v2
+; CI-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; CI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; CI-NEXT:  .LBB5_8: ; %Flow17
+; CI-NEXT:    v_mov_b32_e32 v3, 0x3fc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v1
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], v1, v3
+; CI-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v1
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -1046,38 +3740,161 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_u32 s0, s4, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    flat_load_dword v4, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_load_dword v2, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    s_add_u32 s2, s4, 16
+; VI-NEXT:    s_addc_u32 s3, s5, 0
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    flat_load_dword v1, v[1:2]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_rcp_f32_e32 v3, v2
-; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
-; VI-NEXT:    v_trunc_f32_e32 v3, v3
-; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
-; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1|
+; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT:    s_cbranch_vccz .LBB5_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_brev_b32 s2, -2
+; VI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; VI-NEXT:    s_cbranch_execz .LBB5_3
+; VI-NEXT:    s_branch .LBB5_8
+; VI-NEXT:  .LBB5_2:
+; VI-NEXT:    ; implicit-def: $vgpr2
+; VI-NEXT:  .LBB5_3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; VI-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v1
+; VI-NEXT:    v_ldexp_f32 v6, v2, 12
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT:    v_ldexp_f32 v3, v3, 1
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; VI-NEXT:    v_rcp_f32_e32 v4, v3
+; VI-NEXT:    v_not_b32_e32 v5, v2
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v5
+; VI-NEXT:    s_cbranch_vccnz .LBB5_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, v7, v8
+; VI-NEXT:    v_add_u32_e32 v5, vcc, 12, v5
+; VI-NEXT:  .LBB5_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v6
+; VI-NEXT:    v_mul_f32_e32 v6, v7, v4
+; VI-NEXT:    v_rndne_f32_e32 v6, v6
+; VI-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT:    v_add_f32_e32 v8, v6, v3
+; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -12, v5
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v5
+; VI-NEXT:    v_ldexp_f32 v6, v6, 12
+; VI-NEXT:    s_cbranch_vccnz .LBB5_5
+; VI-NEXT:  ; %bb.6: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v6, v7
+; VI-NEXT:  .LBB5_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -11, v5
+; VI-NEXT:    v_ldexp_f32 v5, v6, v5
+; VI-NEXT:    v_mul_f32_e32 v4, v5, v4
+; VI-NEXT:    v_rndne_f32_e32 v4, v4
+; VI-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT:    v_ldexp_f32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; VI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; VI-NEXT:  .LBB5_8: ; %Flow17
+; VI-NEXT:    v_mov_b32_e32 v5, 0x3fc
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v1
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], v1, v5
+; VI-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v1
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    flat_store_dword v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: unsafe_frem_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
+; GFX9-NEXT:    global_load_dword v0, v2, s[10:11]
+; GFX9-NEXT:    global_load_dword v1, v2, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB5_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_brev_b32 s0, -2
+; GFX9-NEXT:    v_bfi_b32 v2, s0, 0, v0
+; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB5_3
+; GFX9-NEXT:    s_branch .LBB5_8
+; GFX9-NEXT:  .LBB5_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  .LBB5_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v8, v1
+; GFX9-NEXT:    v_ldexp_f32 v6, v2, 12
+; GFX9-NEXT:    v_add_u32_e32 v2, -1, v8
+; GFX9-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX9-NEXT:    v_not_b32_e32 v5, v2
+; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v5
+; GFX9-NEXT:    s_cbranch_vccnz .LBB5_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v5, v7, v8
+; GFX9-NEXT:    v_add_u32_e32 v5, 12, v5
+; GFX9-NEXT:  .LBB5_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v6
+; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v4
+; GFX9-NEXT:    v_rndne_f32_e32 v6, v6
+; GFX9-NEXT:    v_fma_f32 v6, -v6, v3, v7
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_add_f32_e32 v8, v6, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, -12, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v5
+; GFX9-NEXT:    v_ldexp_f32 v6, v6, 12
+; GFX9-NEXT:    s_cbranch_vccnz .LBB5_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-NEXT:  .LBB5_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v5, -11, v5
+; GFX9-NEXT:    v_ldexp_f32 v5, v6, v5
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX9-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX9-NEXT:  .LBB5_8: ; %Flow17
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3fc
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[0:1], v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v1
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    global_store_dword v3, v0, s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: unsafe_frem_f32:
@@ -1085,17 +3902,78 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
+; GFX10-NEXT:    global_load_dword v0, v2, s[2:3]
+; GFX10-NEXT:    global_load_dword v1, v2, s[6:7] offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s2, |v0|, |v1|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB5_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, v0
+; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1|
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB5_3
+; GFX10-NEXT:    s_branch .LBB5_8
+; GFX10-NEXT:  .LBB5_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  .LBB5_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; GFX10-NEXT:    v_ldexp_f32 v5, v2, 12
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
+; GFX10-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX10-NEXT:    v_not_b32_e32 v7, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v6
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v7
+; GFX10-NEXT:    s_cbranch_vccnz .LBB5_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 12
+; GFX10-NEXT:  .LBB5_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v5
+; GFX10-NEXT:    s_add_i32 s2, s2, -12
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX10-NEXT:    v_mul_f32_e32 v5, v6, v4
+; GFX10-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX10-NEXT:    v_fma_f32 v5, -v5, v3, v6
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    v_add_f32_e32 v7, v5, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX10-NEXT:    s_cbranch_scc1 .LBB5_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v7, s2
+; GFX10-NEXT:    v_mov_b32_e32 v5, v6
+; GFX10-NEXT:  .LBB5_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, -11, v7
+; GFX10-NEXT:    v_ldexp_f32 v5, v5, v6
+; GFX10-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX10-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX10-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX10-NEXT:  .LBB5_8: ; %Flow17
+; GFX10-NEXT:    v_cmp_class_f32_e64 s2, v1, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: unsafe_frem_f32:
@@ -1103,19 +3981,95 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX11-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX11-NEXT:    global_load_b32 v1, v1, s[4:5] offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s2, |v0|, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB5_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB5_3
+; GFX11-NEXT:    s_branch .LBB5_8
+; GFX11-NEXT:  .LBB5_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  .LBB5_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v5, v2, 12
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
+; GFX11-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX11-NEXT:    v_not_b32_e32 v7, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, v7, v6
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v7
+; GFX11-NEXT:    s_cbranch_vccnz .LBB5_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 12
+; GFX11-NEXT:  .LBB5_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v5
+; GFX11-NEXT:    s_add_i32 s2, s2, -12
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v5, v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX11-NEXT:    v_fma_f32 v5, -v5, v3, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX11-NEXT:    v_add_f32_e32 v7, v5, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX11-NEXT:    s_cbranch_scc1 .LBB5_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v7, s2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v6
+; GFX11-NEXT:  .LBB5_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, -11, v7
+; GFX11-NEXT:    v_ldexp_f32 v5, v5, v6
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX11-NEXT:    v_fma_f32 v4, -v4, v3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX11-NEXT:  .LBB5_8: ; %Flow17
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, v1, 0x3fc
+; GFX11-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, 0x7fc00000, v2
+; GFX11-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f32:
@@ -1123,20 +4077,100 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX1150-NEXT:    global_load_b32 v1, v1, s[4:5] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v0
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v3, v1, v3
-; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v2, v3
+; GFX1150-NEXT:    s_cbranch_vccz .LBB5_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v0
+; GFX1150-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v2, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB5_3
+; GFX1150-NEXT:    s_branch .LBB5_8
+; GFX1150-NEXT:  .LBB5_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-NEXT:  .LBB5_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v6, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_ldexp_f32 v5, v2, 12
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX1150-NEXT:    v_not_b32_e32 v7, v2
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1150-NEXT:    v_add_nc_u32_e32 v7, v7, v6
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v7
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB5_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s2, s3
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 12
+; GFX1150-NEXT:  .LBB5_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v5
+; GFX1150-NEXT:    s_add_i32 s2, s2, -12
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v6, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fma_f32 v5, v5, v3, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v7, v5, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB5_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v7, s2
+; GFX1150-NEXT:    v_mov_b32_e32 v5, v6
+; GFX1150-NEXT:  .LBB5_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, -11, v7
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, v6
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v5, v4, v3
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX1150-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX1150-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX1150-NEXT:  .LBB5_8: ; %Flow17
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s2, v1, 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX1150-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s3
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX1150-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, 0x7fc00000, v2
+; GFX1150-NEXT:    global_store_b32 v3, v0, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
@@ -1150,94 +4184,243 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_mov_b32 s0, s2
-; SI-NEXT:    s_mov_b32 s1, s3
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
 ; SI-NEXT:    s_mov_b32 s2, s6
 ; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    s_mov_b32 s10, s6
-; SI-NEXT:    s_mov_b32 s11, s7
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
-; SI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
-; SI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
-; SI-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v9
+; SI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB6_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; SI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB6_3
+; SI-NEXT:    s_branch .LBB6_8
+; SI-NEXT:  .LBB6_2:
+; SI-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB6_3: ; %frem.compute
+; SI-NEXT:    s_brev_b32 s5, -2
+; SI-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
+; SI-NEXT:    s_mov_b32 s0, 0
+; SI-NEXT:    s_mov_b32 s1, 0x7ff00000
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[0:1]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; SI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; SI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v6
+; SI-NEXT:    s_cselect_b32 s3, s2, 0
+; SI-NEXT:    v_ldexp_f64 v[6:7], v[4:5], 26
+; SI-NEXT:    v_and_b32_e32 v8, 0x7fffffff, v3
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[0:1]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; SI-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v8, v[2:3]
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v8
+; SI-NEXT:    s_cselect_b32 s7, s0, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_add_i32 s4, s7, -1
+; SI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; SI-NEXT:    s_not_b32 s0, s4
+; SI-NEXT:    s_add_i32 s6, s0, s3
+; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[4:5], v[4:5], 1.0
+; SI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
+; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], 1.0, v[4:5], 1.0
+; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
+; SI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v9
+; SI-NEXT:    s_mov_b32 s0, 0x3ff00000
+; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, v13
 ; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; SI-NEXT:    s_nop 1
-; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
-; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    v_readfirstlane_b32 s2, v5
-; SI-NEXT:    s_bfe_u32 s0, s2, 0xb0014
-; SI-NEXT:    s_add_i32 s3, s0, 0xfffffc01
-; SI-NEXT:    s_mov_b32 s1, 0xfffff
-; SI-NEXT:    s_mov_b32 s0, s6
-; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s3
-; SI-NEXT:    v_not_b32_e32 v6, s0
-; SI-NEXT:    v_and_b32_e32 v6, v4, v6
-; SI-NEXT:    v_not_b32_e32 v7, s1
-; SI-NEXT:    v_and_b32_e32 v5, v5, v7
-; SI-NEXT:    s_and_b32 s0, s2, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s3, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v7, s0
-; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT:    s_cmp_gt_i32 s3, 51
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_mov_b32_e32 v7, s2
-; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
+; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0
+; SI-NEXT:    s_cmp_lt_i32 s6, 27
+; SI-NEXT:    s_cbranch_scc1 .LBB6_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s0, s3, s7
+; SI-NEXT:    s_add_i32 s6, s0, 26
+; SI-NEXT:    s_mov_b32 s3, 0x432fffff
+; SI-NEXT:    v_mov_b32_e32 v14, 0x43300000
+; SI-NEXT:    v_mov_b32_e32 v10, 0
+; SI-NEXT:  .LBB6_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v13, v7
+; SI-NEXT:    v_mov_b32_e32 v12, v6
+; SI-NEXT:    v_mul_f64 v[6:7], v[12:13], v[8:9]
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[2:3]
+; SI-NEXT:    v_bfi_b32 v11, s5, v14, v7
+; SI-NEXT:    v_add_f64 v[15:16], v[6:7], v[10:11]
+; SI-NEXT:    v_add_f64 v[15:16], v[15:16], -v[10:11]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v16, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
+; SI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[12:13]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; SI-NEXT:    v_add_f64 v[15:16], v[6:7], v[4:5]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v16, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v6, v15, vcc
+; SI-NEXT:    v_ldexp_f64 v[6:7], v[6:7], 26
+; SI-NEXT:    s_sub_i32 s6, s6, 26
+; SI-NEXT:    s_cmp_gt_i32 s6, 26
+; SI-NEXT:    s_cbranch_scc1 .LBB6_5
+; SI-NEXT:  ; %bb.6: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v6, v12
+; SI-NEXT:    v_mov_b32_e32 v7, v13
+; SI-NEXT:  .LBB6_7: ; %frem.loop_exit
+; SI-NEXT:    s_sub_i32 s0, s6, 25
+; SI-NEXT:    v_ldexp_f64 v[6:7], v[6:7], s0
+; SI-NEXT:    v_mul_f64 v[8:9], v[6:7], v[8:9]
+; SI-NEXT:    s_mov_b32 s0, -1
+; SI-NEXT:    s_mov_b32 s1, 0x432fffff
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[8:9]|, s[0:1]
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_mov_b32_e32 v10, 0x43300000
+; SI-NEXT:    v_bfi_b32 v11, s0, v10, v9
+; SI-NEXT:    v_mov_b32_e32 v10, 0
+; SI-NEXT:    v_add_f64 v[12:13], v[8:9], v[10:11]
+; SI-NEXT:    v_add_f64 v[10:11], v[12:13], -v[10:11]
+; SI-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; SI-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; SI-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], s4
+; SI-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; SI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; SI-NEXT:  .LBB6_8: ; %Flow17
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[2:3]
+; SI-NEXT:    v_mov_b32_e32 v6, 0x3fc
+; SI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[2:3], v6
+; SI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; SI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v2
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v5, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: frem_f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s11, 0xf000
-; CI-NEXT:    s_mov_b32 s10, -1
-; CI-NEXT:    s_mov_b32 s6, s10
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s8, s0
-; CI-NEXT:    s_mov_b32 s9, s1
-; CI-NEXT:    s_mov_b32 s0, s2
-; CI-NEXT:    s_mov_b32 s1, s3
-; CI-NEXT:    s_mov_b32 s2, s10
-; CI-NEXT:    s_mov_b32 s3, s11
-; CI-NEXT:    s_mov_b32 s7, s11
-; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
-; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
-; CI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
-; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
+; CI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB6_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; CI-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; CI-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; CI-NEXT:    s_cbranch_execz .LBB6_3
+; CI-NEXT:    s_branch .LBB6_8
+; CI-NEXT:  .LBB6_2:
+; CI-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CI-NEXT:  .LBB6_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v11, v[2:3]
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v10, v[0:1]
+; CI-NEXT:    v_ldexp_f64 v[6:7], v[4:5], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; CI-NEXT:    v_add_i32_e32 v12, vcc, -1, v11
+; CI-NEXT:    v_not_b32_e32 v8, v12
+; CI-NEXT:    v_add_i32_e32 v13, vcc, v8, v10
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; CI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[4:5], v[4:5], 1.0
+; CI-NEXT:    v_rcp_f64_e32 v[14:15], v[8:9]
+; CI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0
+; CI-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; CI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0
+; CI-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; CI-NEXT:    v_div_scale_f64 v[16:17], vcc, 1.0, v[4:5], 1.0
+; CI-NEXT:    v_mul_f64 v[18:19], v[16:17], v[14:15]
+; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
 ; CI-NEXT:    s_nop 1
-; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
-; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; CI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[14:15], v[18:19]
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v13
+; CI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB6_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v10, vcc, v10, v11
+; CI-NEXT:    v_add_i32_e32 v13, vcc, 26, v10
+; CI-NEXT:  .LBB6_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v11, v7
+; CI-NEXT:    v_mov_b32_e32 v10, v6
+; CI-NEXT:    v_mul_f64 v[6:7], v[10:11], v[8:9]
+; CI-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT:    v_add_f64 v[14:15], v[6:7], v[4:5]
+; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; CI-NEXT:    v_ldexp_f64 v[6:7], v[6:7], 26
+; CI-NEXT:    v_subrev_i32_e32 v13, vcc, 26, v13
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v13
+; CI-NEXT:    s_cbranch_vccnz .LBB6_5
+; CI-NEXT:  ; %bb.6: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v6, v10
+; CI-NEXT:    v_mov_b32_e32 v7, v11
+; CI-NEXT:  .LBB6_7: ; %frem.loop_exit
+; CI-NEXT:    v_subrev_i32_e32 v10, vcc, 25, v13
+; CI-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; CI-NEXT:    v_mul_f64 v[8:9], v[6:7], v[8:9]
+; CI-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; CI-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; CI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
+; CI-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; CI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; CI-NEXT:  .LBB6_8: ; %Flow17
+; CI-NEXT:    v_mov_b32_e32 v6, 0x3fc
+; CI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[2:3], v6
+; CI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v2
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v0, v5, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -1246,56 +4429,191 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
-; VI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
-; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; VI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
-; VI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
-; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
+; VI-NEXT:    v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
+; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT:    s_cbranch_vccz .LBB6_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; VI-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; VI-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; VI-NEXT:    s_cbranch_execz .LBB6_3
+; VI-NEXT:    s_branch .LBB6_8
+; VI-NEXT:  .LBB6_2:
+; VI-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; VI-NEXT:  .LBB6_3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v11, v[2:3]
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v10, v[0:1]
+; VI-NEXT:    v_ldexp_f64 v[6:7], v[4:5], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; VI-NEXT:    v_add_u32_e32 v12, vcc, -1, v11
+; VI-NEXT:    v_not_b32_e32 v8, v12
+; VI-NEXT:    v_add_u32_e32 v13, vcc, v8, v10
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; VI-NEXT:    v_div_scale_f64 v[8:9], s[2:3], v[4:5], v[4:5], 1.0
+; VI-NEXT:    v_rcp_f64_e32 v[14:15], v[8:9]
+; VI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0
+; VI-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; VI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0
+; VI-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; VI-NEXT:    v_div_scale_f64 v[16:17], vcc, 1.0, v[4:5], 1.0
+; VI-NEXT:    v_mul_f64 v[18:19], v[16:17], v[14:15]
+; VI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
 ; VI-NEXT:    s_nop 1
-; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
-; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
-; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[14:15], v[18:19]
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v13
+; VI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB6_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v10, vcc, v10, v11
+; VI-NEXT:    v_add_u32_e32 v13, vcc, 26, v10
+; VI-NEXT:  .LBB6_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v11, v7
+; VI-NEXT:    v_mov_b32_e32 v10, v6
+; VI-NEXT:    v_mul_f64 v[6:7], v[10:11], v[8:9]
+; VI-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT:    v_add_f64 v[14:15], v[6:7], v[4:5]
+; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; VI-NEXT:    v_ldexp_f64 v[6:7], v[6:7], 26
+; VI-NEXT:    v_subrev_u32_e32 v13, vcc, 26, v13
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v13
+; VI-NEXT:    s_cbranch_vccnz .LBB6_5
+; VI-NEXT:  ; %bb.6: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v6, v10
+; VI-NEXT:    v_mov_b32_e32 v7, v11
+; VI-NEXT:  .LBB6_7: ; %frem.loop_exit
+; VI-NEXT:    v_subrev_u32_e32 v10, vcc, 25, v13
+; VI-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; VI-NEXT:    v_mul_f64 v[8:9], v[6:7], v[8:9]
+; VI-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; VI-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; VI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
+; VI-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; VI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; VI-NEXT:  .LBB6_8: ; %Flow17
+; VI-NEXT:    v_mov_b32_e32 v8, 0x3fc
+; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[2:3], v8
+; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v2
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v5, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; VI-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: frem_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v12, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v12, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[10:11]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], v[0:1]
-; GFX9-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX9-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
-; GFX9-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
+; GFX9-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB6_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB6_3
+; GFX9-NEXT:    s_branch .LBB6_8
+; GFX9-NEXT:  .LBB6_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT:  .LBB6_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v11, v[2:3]
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v10, v[0:1]
+; GFX9-NEXT:    v_ldexp_f64 v[6:7], v[4:5], 26
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; GFX9-NEXT:    v_add_u32_e32 v12, -1, v11
+; GFX9-NEXT:    v_not_b32_e32 v8, v12
+; GFX9-NEXT:    v_add_u32_e32 v13, v8, v10
+; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[4:5], v[4:5], 1.0
+; GFX9-NEXT:    v_rcp_f64_e32 v[14:15], v[8:9]
+; GFX9-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0
+; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; GFX9-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], 1.0
+; GFX9-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; GFX9-NEXT:    v_div_scale_f64 v[16:17], vcc, 1.0, v[4:5], 1.0
+; GFX9-NEXT:    v_mul_f64 v[18:19], v[16:17], v[14:15]
+; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[18:19], v[16:17]
 ; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
-; GFX9-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1]
+; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[14:15], v[18:19]
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v13
+; GFX9-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB6_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v10, v10, v11
+; GFX9-NEXT:    v_add_u32_e32 v13, 26, v10
+; GFX9-NEXT:  .LBB6_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v11, v7
+; GFX9-NEXT:    v_mov_b32_e32 v10, v6
+; GFX9-NEXT:    v_mul_f64 v[6:7], v[10:11], v[8:9]
+; GFX9-NEXT:    v_subrev_u32_e32 v13, 26, v13
+; GFX9-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; GFX9-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT:    v_add_f64 v[14:15], v[6:7], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v13
+; GFX9-NEXT:    s_cbranch_vccnz .LBB6_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v6, v10
+; GFX9-NEXT:    v_mov_b32_e32 v7, v11
+; GFX9-NEXT:  .LBB6_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_subrev_u32_e32 v10, 25, v13
+; GFX9-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; GFX9-NEXT:    v_mul_f64 v[8:9], v[6:7], v[8:9]
+; GFX9-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; GFX9-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
+; GFX9-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; GFX9-NEXT:    v_xor_b32_e32 v5, v6, v5
+; GFX9-NEXT:  .LBB6_8: ; %Flow17
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x3fc
+; GFX9-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[0:1], v[2:3], v7
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: frem_f64:
@@ -1303,26 +4621,94 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v12, s[2:3]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v12, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v12, 0x80000000, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[4:5], s2, v[2:3], v[2:3], v[0:1]
-; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX10-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
-; GFX10-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
-; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
-; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
-; GFX10-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[0:1]
+; GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB6_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]|
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v1, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB6_3
+; GFX10-NEXT:    s_branch .LBB6_8
+; GFX10-NEXT:  .LBB6_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT:  .LBB6_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v9, v[2:3]
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v8, v[0:1]
+; GFX10-NEXT:    v_ldexp_f64 v[6:7], v[4:5], 26
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; GFX10-NEXT:    v_add_nc_u32_e32 v13, -1, v9
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX10-NEXT:    v_not_b32_e32 v9, v13
+; GFX10-NEXT:    v_add_nc_u32_e32 v14, v9, v8
+; GFX10-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; GFX10-NEXT:    v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], 1.0
+; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[15:16], -v[8:9], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[15:16], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[15:16], -v[8:9], v[10:11], 1.0
+; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[15:16], v[10:11]
+; GFX10-NEXT:    v_div_scale_f64 v[15:16], vcc_lo, 1.0, v[4:5], 1.0
+; GFX10-NEXT:    v_mul_f64 v[17:18], v[15:16], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[17:18], v[15:16]
+; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[17:18]
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v14
+; GFX10-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB6_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 26
+; GFX10-NEXT:  .LBB6_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v7
+; GFX10-NEXT:    v_mov_b32_e32 v10, v6
+; GFX10-NEXT:    s_sub_i32 s2, s2, 26
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX10-NEXT:    v_mul_f64 v[6:7], v[10:11], v[8:9]
+; GFX10-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX10-NEXT:    v_add_f64 v[14:15], v[6:7], v[4:5]
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX10-NEXT:    s_cbranch_scc1 .LBB6_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v6, v10
+; GFX10-NEXT:    v_mov_b32_e32 v14, s2
+; GFX10-NEXT:    v_mov_b32_e32 v7, v11
+; GFX10-NEXT:  .LBB6_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 25, v14
+; GFX10-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; GFX10-NEXT:    v_mul_f64 v[8:9], v[6:7], v[8:9]
+; GFX10-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
+; GFX10-NEXT:    v_xor_b32_e32 v5, v12, v5
+; GFX10-NEXT:  .LBB6_8: ; %Flow17
+; GFX10-NEXT:    v_cmp_class_f64_e64 s2, v[2:3], 0x3fc
+; GFX10-NEXT:    v_cmp_class_f64_e64 s3, v[0:1], 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: frem_f64:
@@ -1330,32 +4716,111 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v12, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v12, s[2:3]
-; GFX11-NEXT:    global_load_b64 v[2:3], v12, s[4:5]
+; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v12, 0x80000000, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX11-NEXT:    v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB6_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]|
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v1, v12, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB6_3
+; GFX11-NEXT:    s_branch .LBB6_8
+; GFX11-NEXT:  .LBB6_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT:  .LBB6_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v9, v[2:3]
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v8, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f64 v[6:7], v[4:5], 26
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, -1, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v9, v13
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, v9, v8
+; GFX11-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], 1.0
+; GFX11-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX11-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX11-NEXT:    v_fma_f64 v[15:16], -v[8:9], v[10:11], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[15:16], v[10:11]
+; GFX11-NEXT:    v_fma_f64 v[15:16], -v[8:9], v[10:11], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[15:16], v[10:11]
+; GFX11-NEXT:    v_div_scale_f64 v[15:16], vcc_lo, 1.0, v[4:5], 1.0
+; GFX11-NEXT:    v_mul_f64 v[17:18], v[15:16], v[10:11]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[17:18], v[15:16]
+; GFX11-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[17:18]
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB6_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 26
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB6_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v11, v7 :: v_dual_mov_b32 v10, v6
+; GFX11-NEXT:    s_sub_i32 s2, s2, 26
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[6:7], v[10:11], v[8:9]
+; GFX11-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX11-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX11-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT:    v_add_f64 v[14:15], v[6:7], v[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
-; GFX11-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
+; GFX11-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_cndmask_b32 v6, v6, v14
+; GFX11-NEXT:    v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX11-NEXT:    s_cbranch_scc1 .LBB6_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v6, v10
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v7, v11
+; GFX11-NEXT:  .LBB6_7: ; %frem.loop_exit
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
-; GFX11-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 25, v14
+; GFX11-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX11-NEXT:    global_store_b64 v12, v[0:1], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[8:9], v[6:7], v[8:9]
+; GFX11-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v5, v7, v5 :: v_dual_cndmask_b32 v4, v6, v4
+; GFX11-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v5, v12, v5
+; GFX11-NEXT:  .LBB6_8: ; %Flow17
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[2:3], 0x3fc
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[0:1], 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_cndmask_b32 v1, 0x7ff80000, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f64:
@@ -1363,32 +4828,110 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b64 v[0:1], v12, s[2:3]
-; GFX1150-NEXT:    global_load_b64 v[2:3], v12, s[4:5]
+; GFX1150-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_and_b32_e32 v12, 0x80000000, v1
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]|
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX1150-NEXT:    s_cbranch_vccz .LBB6_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]|
+; GFX1150-NEXT:    v_cndmask_b32_e32 v5, v1, v12, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB6_3
+; GFX1150-NEXT:    s_branch .LBB6_8
+; GFX1150-NEXT:  .LBB6_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX1150-NEXT:  .LBB6_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v9, v[2:3]
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v8, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_ldexp_f64 v[6:7], v[4:5], 26
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; GFX1150-NEXT:    v_add_nc_u32_e32 v13, -1, v9
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v9, v13
+; GFX1150-NEXT:    v_add_nc_u32_e32 v14, v9, v8
+; GFX1150-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; GFX1150-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
-; GFX1150-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
+; GFX1150-NEXT:    v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], 1.0
+; GFX1150-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[15:16], -v[8:9], v[10:11], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[15:16], v[10:11]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[15:16], -v[8:9], v[10:11], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[15:16], v[10:11]
+; GFX1150-NEXT:    v_div_scale_f64 v[15:16], vcc_lo, 1.0, v[4:5], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[17:18], v[15:16], v[10:11]
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[17:18], v[15:16]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[17:18]
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v14
+; GFX1150-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[4:5], 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB6_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s2, s3
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 26
+; GFX1150-NEXT:    .p2align 6
+; GFX1150-NEXT:  .LBB6_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_dual_mov_b32 v11, v7 :: v_dual_mov_b32 v10, v6
+; GFX1150-NEXT:    s_sub_i32 s2, s2, 26
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 26
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
-; GFX1150-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
+; GFX1150-NEXT:    v_mul_f64 v[6:7], v[10:11], v[8:9]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[10:11]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1150-NEXT:    v_add_f64 v[14:15], v[6:7], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_dual_cndmask_b32 v7, v7, v15 :: v_dual_cndmask_b32 v6, v6, v14
+; GFX1150-NEXT:    v_ldexp_f64 v[6:7], v[6:7], 26
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB6_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v10
+; GFX1150-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v7, v11
+; GFX1150-NEXT:  .LBB6_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_subrev_nc_u32_e32 v10, 25, v14
+; GFX1150-NEXT:    v_ldexp_f64 v[6:7], v[6:7], v10
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[8:9], v[6:7], v[8:9]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[8:9], v[4:5], v[6:7]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1150-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_dual_cndmask_b32 v5, v7, v5 :: v_dual_cndmask_b32 v4, v6, v4
+; GFX1150-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX1150-NEXT:    global_store_b64 v12, v[0:1], s[0:1]
+; GFX1150-NEXT:    v_xor_b32_e32 v5, v12, v5
+; GFX1150-NEXT:  .LBB6_8: ; %Flow17
+; GFX1150-NEXT:    v_cmp_class_f64_e64 s2, v[2:3], 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f64_e64 s3, v[0:1], 0x1f8
+; GFX1150-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[2:3]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s3
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX1150-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_cndmask_b32 v1, 0x7ff80000, v5
+; GFX1150-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX1150-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -1402,52 +4945,133 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; SI-LABEL: fast_frem_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s8
-; SI-NEXT:    s_mov_b32 s1, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
-; SI-NEXT:    s_mov_b32 s10, s2
-; SI-NEXT:    s_mov_b32 s11, s3
-; SI-NEXT:    s_mov_b32 s6, s2
-; SI-NEXT:    s_mov_b32 s7, s3
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; SI-NEXT:    v_readfirstlane_b32 s6, v5
-; SI-NEXT:    s_bfe_u32 s4, s6, 0xb0014
-; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
-; SI-NEXT:    s_mov_b32 s5, 0xfffff
-; SI-NEXT:    s_mov_b32 s4, s2
-; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
-; SI-NEXT:    v_not_b32_e32 v6, s4
-; SI-NEXT:    v_and_b32_e32 v6, v4, v6
-; SI-NEXT:    v_not_b32_e32 v7, s5
-; SI-NEXT:    v_and_b32_e32 v5, v5, v7
-; SI-NEXT:    s_and_b32 s4, s6, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s7, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT:    s_cmp_gt_i32 s7, 51
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_mov_b32_e32 v7, s6
-; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    v_cmp_le_f64_e64 s[0:1], |v[0:1]|, |v[2:3]|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB7_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; SI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB7_3
+; SI-NEXT:    s_branch .LBB7_8
+; SI-NEXT:  .LBB7_2:
+; SI-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB7_3: ; %frem.compute
+; SI-NEXT:    s_brev_b32 s5, -2
+; SI-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
+; SI-NEXT:    s_mov_b32 s0, 0
+; SI-NEXT:    s_mov_b32 s1, 0x7ff00000
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[0:1]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; SI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; SI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v0
+; SI-NEXT:    s_cselect_b32 s3, s2, 0
+; SI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; SI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v3
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[0:1]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; SI-NEXT:    v_cndmask_b32_e32 v7, v0, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v2, v6, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[2:3]
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    s_cselect_b32 s7, s0, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_add_i32 s4, s7, -1
+; SI-NEXT:    v_ldexp_f64 v[2:3], v[6:7], 1
+; SI-NEXT:    s_not_b32 s0, s4
+; SI-NEXT:    s_add_i32 s6, s0, s3
+; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[2:3], v[2:3], 1.0
+; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
+; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; SI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], 1.0, v[2:3], 1.0
+; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
+; SI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; SI-NEXT:    s_mov_b32 s0, 0x3ff00000
+; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, v11
+; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
+; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[2:3], 1.0
+; SI-NEXT:    s_cmp_lt_i32 s6, 27
+; SI-NEXT:    s_cbranch_scc1 .LBB7_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s0, s3, s7
+; SI-NEXT:    s_add_i32 s6, s0, 26
+; SI-NEXT:    s_mov_b32 s3, 0x432fffff
+; SI-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; SI-NEXT:    v_mov_b32_e32 v8, 0
+; SI-NEXT:  .LBB7_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v11, v5
+; SI-NEXT:    v_mov_b32_e32 v10, v4
+; SI-NEXT:    v_mul_f64 v[4:5], v[10:11], v[6:7]
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[4:5]|, s[2:3]
+; SI-NEXT:    v_bfi_b32 v9, s5, v0, v5
+; SI-NEXT:    v_add_f64 v[12:13], v[4:5], v[8:9]
+; SI-NEXT:    v_add_f64 v[12:13], v[12:13], -v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; SI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[2:3], v[10:11]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; SI-NEXT:    v_add_f64 v[12:13], v[4:5], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; SI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; SI-NEXT:    s_sub_i32 s6, s6, 26
+; SI-NEXT:    s_cmp_gt_i32 s6, 26
+; SI-NEXT:    s_cbranch_scc1 .LBB7_5
+; SI-NEXT:  ; %bb.6: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v4, v10
+; SI-NEXT:    v_mov_b32_e32 v5, v11
+; SI-NEXT:  .LBB7_7: ; %frem.loop_exit
+; SI-NEXT:    s_sub_i32 s0, s6, 25
+; SI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], s0
+; SI-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
+; SI-NEXT:    s_mov_b32 s0, -1
+; SI-NEXT:    s_mov_b32 s1, 0x432fffff
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[0:1]
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; SI-NEXT:    v_bfi_b32 v9, s0, v0, v7
+; SI-NEXT:    v_mov_b32_e32 v8, 0
+; SI-NEXT:    v_add_f64 v[10:11], v[6:7], v[8:9]
+; SI-NEXT:    v_add_f64 v[8:9], v[10:11], -v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; SI-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[2:3], v[4:5]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; SI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; SI-NEXT:    v_ldexp_f64 v[4:5], v[2:3], s4
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v1
+; SI-NEXT:    v_xor_b32_e32 v5, v0, v5
+; SI-NEXT:  .LBB7_8: ; %Flow17
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: fast_frem_f64:
@@ -1458,27 +5082,86 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI-NEXT:    s_mov_b32 s10, -1
 ; CI-NEXT:    s_mov_b32 s6, s10
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s8, s0
-; CI-NEXT:    s_mov_b32 s9, s1
-; CI-NEXT:    s_mov_b32 s0, s2
-; CI-NEXT:    s_mov_b32 s1, s3
-; CI-NEXT:    s_mov_b32 s2, s10
-; CI-NEXT:    s_mov_b32 s3, s11
+; CI-NEXT:    s_mov_b32 s8, s2
+; CI-NEXT:    s_mov_b32 s9, s3
 ; CI-NEXT:    s_mov_b32 s7, s11
-; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; CI-NEXT:    buffer_load_dwordx2 v[6:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; CI-NEXT:    v_cmp_le_f64_e64 s[2:3], |v[0:1]|, |v[6:7]|
+; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT:    s_cbranch_vccz .LBB7_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[6:7]|
+; CI-NEXT:    v_and_b32_e32 v2, 0x80000000, v1
+; CI-NEXT:    v_cndmask_b32_e32 v3, v1, v2, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
+; CI-NEXT:    s_cbranch_execz .LBB7_3
+; CI-NEXT:    s_branch .LBB7_8
+; CI-NEXT:  .LBB7_2:
+; CI-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CI-NEXT:  .LBB7_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |v[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v9, v[6:7]
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v8, v[0:1]
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[2:3], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |v[6:7]|
+; CI-NEXT:    v_add_i32_e32 v0, vcc, -1, v9
+; CI-NEXT:    v_not_b32_e32 v6, v0
+; CI-NEXT:    v_add_i32_e32 v10, vcc, v6, v8
+; CI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 1
+; CI-NEXT:    v_div_scale_f64 v[6:7], s[2:3], v[2:3], v[2:3], 1.0
+; CI-NEXT:    v_rcp_f64_e32 v[11:12], v[6:7]
+; CI-NEXT:    v_fma_f64 v[13:14], -v[6:7], v[11:12], 1.0
+; CI-NEXT:    v_fma_f64 v[11:12], v[11:12], v[13:14], v[11:12]
+; CI-NEXT:    v_fma_f64 v[13:14], -v[6:7], v[11:12], 1.0
+; CI-NEXT:    v_fma_f64 v[11:12], v[11:12], v[13:14], v[11:12]
+; CI-NEXT:    v_div_scale_f64 v[13:14], vcc, 1.0, v[2:3], 1.0
+; CI-NEXT:    v_mul_f64 v[15:16], v[13:14], v[11:12]
+; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[15:16], v[13:14]
+; CI-NEXT:    s_nop 1
+; CI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[15:16]
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v10
+; CI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[2:3], 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB7_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v8, vcc, v8, v9
+; CI-NEXT:    v_add_i32_e32 v10, vcc, 26, v8
+; CI-NEXT:  .LBB7_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v9, v5
+; CI-NEXT:    v_mov_b32_e32 v8, v4
+; CI-NEXT:    v_mul_f64 v[4:5], v[8:9], v[6:7]
+; CI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[2:3], v[8:9]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT:    v_add_f64 v[11:12], v[4:5], v[2:3]
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT:    v_subrev_i32_e32 v10, vcc, 26, v10
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v10
+; CI-NEXT:    s_cbranch_vccnz .LBB7_5
+; CI-NEXT:  ; %bb.6: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v4, v8
+; CI-NEXT:    v_mov_b32_e32 v5, v9
+; CI-NEXT:  .LBB7_7: ; %frem.loop_exit
+; CI-NEXT:    v_subrev_i32_e32 v8, vcc, 25, v10
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; CI-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
+; CI-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; CI-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[2:3], v[4:5]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v0
+; CI-NEXT:    v_and_b32_e32 v0, 0x80000000, v1
+; CI-NEXT:    v_xor_b32_e32 v3, v0, v3
+; CI-NEXT:  .LBB7_8: ; %Flow17
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: fast_frem_f64:
@@ -1486,25 +5169,86 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT:    flat_load_dwordx2 v[6:7], v[2:3]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cmp_le_f64_e64 s[2:3], |v[0:1]|, |v[6:7]|
+; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT:    s_cbranch_vccz .LBB7_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[6:7]|
+; VI-NEXT:    v_and_b32_e32 v2, 0x80000000, v1
+; VI-NEXT:    v_cndmask_b32_e32 v3, v1, v2, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
+; VI-NEXT:    s_cbranch_execz .LBB7_3
+; VI-NEXT:    s_branch .LBB7_8
+; VI-NEXT:  .LBB7_2:
+; VI-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; VI-NEXT:  .LBB7_3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |v[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v9, v[6:7]
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v8, v[0:1]
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[2:3], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[2:3], |v[6:7]|
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v9
+; VI-NEXT:    v_not_b32_e32 v6, v0
+; VI-NEXT:    v_add_u32_e32 v10, vcc, v6, v8
+; VI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 1
+; VI-NEXT:    v_div_scale_f64 v[6:7], s[2:3], v[2:3], v[2:3], 1.0
+; VI-NEXT:    v_rcp_f64_e32 v[11:12], v[6:7]
+; VI-NEXT:    v_fma_f64 v[13:14], -v[6:7], v[11:12], 1.0
+; VI-NEXT:    v_fma_f64 v[11:12], v[11:12], v[13:14], v[11:12]
+; VI-NEXT:    v_fma_f64 v[13:14], -v[6:7], v[11:12], 1.0
+; VI-NEXT:    v_fma_f64 v[11:12], v[11:12], v[13:14], v[11:12]
+; VI-NEXT:    v_div_scale_f64 v[13:14], vcc, 1.0, v[2:3], 1.0
+; VI-NEXT:    v_mul_f64 v[15:16], v[13:14], v[11:12]
+; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[15:16], v[13:14]
+; VI-NEXT:    s_nop 1
+; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[15:16]
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v10
+; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[2:3], 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB7_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v8, vcc, v8, v9
+; VI-NEXT:    v_add_u32_e32 v10, vcc, 26, v8
+; VI-NEXT:  .LBB7_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v9, v5
+; VI-NEXT:    v_mov_b32_e32 v8, v4
+; VI-NEXT:    v_mul_f64 v[4:5], v[8:9], v[6:7]
+; VI-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[2:3], v[8:9]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT:    v_add_f64 v[11:12], v[4:5], v[2:3]
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT:    v_subrev_u32_e32 v10, vcc, 26, v10
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v10
+; VI-NEXT:    s_cbranch_vccnz .LBB7_5
+; VI-NEXT:  ; %bb.6: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v4, v8
+; VI-NEXT:    v_mov_b32_e32 v5, v9
+; VI-NEXT:  .LBB7_7: ; %frem.loop_exit
+; VI-NEXT:    v_subrev_u32_e32 v8, vcc, 25, v10
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; VI-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
+; VI-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; VI-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[2:3], v[4:5]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v0
+; VI-NEXT:    v_and_b32_e32 v0, 0x80000000, v1
+; VI-NEXT:    v_xor_b32_e32 v3, v0, v3
+; VI-NEXT:  .LBB7_8: ; %Flow17
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
-; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
-; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
-; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
-; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
-; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1512,22 +5256,84 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_le_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_vccz .LBB7_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB7_3
+; GFX9-NEXT:    s_branch .LBB7_8
+; GFX9-NEXT:  .LBB7_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT:  .LBB7_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[6:7], |v[2:3]|
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v9, v[2:3]
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v8, v[0:1]
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[6:7], 1
+; GFX9-NEXT:    v_add_u32_e32 v0, -1, v9
+; GFX9-NEXT:    v_not_b32_e32 v6, v0
+; GFX9-NEXT:    v_add_u32_e32 v10, v6, v8
+; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[2:3], v[2:3], v[2:3], 1.0
+; GFX9-NEXT:    v_rcp_f64_e32 v[11:12], v[6:7]
+; GFX9-NEXT:    v_fma_f64 v[13:14], -v[6:7], v[11:12], 1.0
+; GFX9-NEXT:    v_fma_f64 v[11:12], v[11:12], v[13:14], v[11:12]
+; GFX9-NEXT:    v_fma_f64 v[13:14], -v[6:7], v[11:12], 1.0
+; GFX9-NEXT:    v_fma_f64 v[11:12], v[11:12], v[13:14], v[11:12]
+; GFX9-NEXT:    v_div_scale_f64 v[13:14], vcc, 1.0, v[2:3], 1.0
+; GFX9-NEXT:    v_mul_f64 v[15:16], v[13:14], v[11:12]
+; GFX9-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[15:16], v[13:14]
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[15:16]
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v10
+; GFX9-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[2:3], 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB7_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v8, v8, v9
+; GFX9-NEXT:    v_add_u32_e32 v10, 26, v8
+; GFX9-NEXT:  .LBB7_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v5
+; GFX9-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-NEXT:    v_mul_f64 v[4:5], v[8:9], v[6:7]
+; GFX9-NEXT:    v_subrev_u32_e32 v10, 26, v10
+; GFX9-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; GFX9-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[2:3], v[8:9]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    v_add_f64 v[11:12], v[4:5], v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v11, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 26
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v10
+; GFX9-NEXT:    s_cbranch_vccnz .LBB7_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-NEXT:    v_mov_b32_e32 v5, v9
+; GFX9-NEXT:  .LBB7_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_subrev_u32_e32 v8, 25, v10
+; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v8
+; GFX9-NEXT:    v_mul_f64 v[6:7], v[4:5], v[6:7]
+; GFX9-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; GFX9-NEXT:    v_fma_f64 v[4:5], -v[6:7], v[2:3], v[4:5]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[2:3], v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80000000, v1
+; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v5
+; GFX9-NEXT:  .LBB7_8: ; %Flow17
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_dwordx2 v0, v[4:5], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: fast_frem_f64:
@@ -1535,23 +5341,87 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[2:3]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v8, 0x80000000, v3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1]
+; GFX10-NEXT:    v_cmp_le_f64_e64 s2, |v[2:3]|, |v[0:1]|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB7_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[0:1]|
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v2, 0, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB7_3
+; GFX10-NEXT:    s_branch .LBB7_8
+; GFX10-NEXT:  .LBB7_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT:  .LBB7_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v6, v[2:3]
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[2:3], |v[2:3]|
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v0
+; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[4:5], 1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 26
+; GFX10-NEXT:    v_not_b32_e32 v4, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, v4, v6
+; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
+; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX10-NEXT:    v_fma_f64 v[11:12], -v[4:5], v[6:7], 1.0
+; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[11:12], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[11:12], -v[4:5], v[6:7], 1.0
+; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[11:12], v[6:7]
+; GFX10-NEXT:    v_div_scale_f64 v[11:12], vcc_lo, 1.0, v[0:1], 1.0
+; GFX10-NEXT:    v_mul_f64 v[13:14], v[11:12], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[13:14], v[11:12]
+; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[13:14]
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX10-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[0:1], 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB7_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 26
+; GFX10-NEXT:  .LBB7_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v3
+; GFX10-NEXT:    v_mov_b32_e32 v6, v2
+; GFX10-NEXT:    s_sub_i32 s2, s2, 26
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX10-NEXT:    v_mul_f64 v[2:3], v[6:7], v[4:5]
+; GFX10-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[6:7]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_add_f64 v[10:11], v[2:3], v[0:1]
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 26
+; GFX10-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v2, v6
+; GFX10-NEXT:    v_mov_b32_e32 v10, s2
+; GFX10-NEXT:    v_mov_b32_e32 v3, v7
+; GFX10-NEXT:  .LBB7_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, 25, v10
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
+; GFX10-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; GFX10-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
+; GFX10-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[0:1], v[2:3]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[4:5], v[0:1], v9
+; GFX10-NEXT:    v_xor_b32_e32 v5, v8, v5
+; GFX10-NEXT:  .LBB7_8: ; %Flow17
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    global_store_dwordx2 v0, v[4:5], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: fast_frem_f64:
@@ -1559,28 +5429,104 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v10, 0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
-; GFX11-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
+; GFX11-NEXT:    global_load_b64 v[4:5], v0, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v8, 0x80000000, v5
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX11-NEXT:    v_cmp_le_f64_e64 s2, |v[4:5]|, |v[0:1]|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB7_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[4:5]|, |v[0:1]|
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB7_3
+; GFX11-NEXT:    s_branch .LBB7_8
+; GFX11-NEXT:  .LBB7_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT:  .LBB7_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v6, v[4:5]
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[2:3], |v[4:5]|
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 26
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, -1, v0
+; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[4:5], 1
+; GFX11-NEXT:    v_not_b32_e32 v4, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, v4, v6
+; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX11-NEXT:    v_fma_f64 v[11:12], -v[4:5], v[6:7], 1.0
+; GFX11-NEXT:    v_fma_f64 v[6:7], v[6:7], v[11:12], v[6:7]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[11:12], -v[4:5], v[6:7], 1.0
+; GFX11-NEXT:    v_fma_f64 v[6:7], v[6:7], v[11:12], v[6:7]
+; GFX11-NEXT:    v_div_scale_f64 v[11:12], vcc_lo, 1.0, v[0:1], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[13:14], v[11:12], v[6:7]
+; GFX11-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[13:14], v[11:12]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[13:14]
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX11-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[0:1], 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB7_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 26
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB7_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
+; GFX11-NEXT:    s_sub_i32 s2, s2, 26
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[2:3], v[6:7], v[4:5]
+; GFX11-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[6:7]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    v_add_f64 v[10:11], v[2:3], v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX11-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_cndmask_b32 v2, v2, v10
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 26
+; GFX11-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v2, v6
+; GFX11-NEXT:    v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v3, v7
+; GFX11-NEXT:  .LBB7_7: ; %frem.loop_exit
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX11-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 25, v10
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; GFX11-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX11-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; GFX11-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[0:1], v[2:3]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[0:1], v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v3, v8, v3
+; GFX11-NEXT:  .LBB7_8: ; %Flow17
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f64:
@@ -1588,28 +5534,104 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
-; GFX1150-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
+; GFX1150-NEXT:    global_load_b64 v[4:5], v0, s[2:3]
+; GFX1150-NEXT:    global_load_b64 v[0:1], v0, s[4:5]
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_and_b32_e32 v8, 0x80000000, v5
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    v_cmp_le_f64_e64 s2, |v[4:5]|, |v[0:1]|
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX1150-NEXT:    s_cbranch_vccz .LBB7_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[4:5]|, |v[0:1]|
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB7_3
+; GFX1150-NEXT:    s_branch .LBB7_8
+; GFX1150-NEXT:  .LBB7_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX1150-NEXT:  .LBB7_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v6, v[4:5]
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[2:3], |v[4:5]|
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX1150-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 26
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX1150-NEXT:    v_add_nc_u32_e32 v9, -1, v0
+; GFX1150-NEXT:    v_ldexp_f64 v[0:1], v[4:5], 1
+; GFX1150-NEXT:    v_not_b32_e32 v4, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v10, v4, v6
+; GFX1150-NEXT:    v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX1150-NEXT:    v_fma_f64 v[11:12], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[11:12], v[6:7]
+; GFX1150-NEXT:    v_fma_f64 v[11:12], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[11:12], v[6:7]
+; GFX1150-NEXT:    v_div_scale_f64 v[11:12], vcc_lo, 1.0, v[0:1], 1.0
+; GFX1150-NEXT:    v_mul_f64 v[13:14], v[11:12], v[6:7]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[13:14], v[11:12]
+; GFX1150-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[13:14]
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v10
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[0:1], 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB7_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s2, s3
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 26
+; GFX1150-NEXT:    .p2align 6
+; GFX1150-NEXT:  .LBB7_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2
+; GFX1150-NEXT:    s_sub_i32 s2, s2, 26
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[2:3], v[6:7], v[4:5]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[0:1], v[6:7]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[2:3]
+; GFX1150-NEXT:    v_add_f64 v[10:11], v[2:3], v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_dual_cndmask_b32 v3, v3, v11 :: v_dual_cndmask_b32 v2, v2, v10
+; GFX1150-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 26
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB7_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v2, v6
+; GFX1150-NEXT:    v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v3, v7
+; GFX1150-NEXT:  .LBB7_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_subrev_nc_u32_e32 v6, 25, v10
+; GFX1150-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    v_mul_f64 v[4:5], v[2:3], v[4:5]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[4:5], v[4:5]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1150-NEXT:    v_fma_f64 v[2:3], -v[4:5], v[0:1], v[2:3]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[2:3]
+; GFX1150-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1150-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0
+; GFX1150-NEXT:    v_ldexp_f64 v[2:3], v[0:1], v9
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX1150-NEXT:    v_xor_b32_e32 v3, v8, v3
+; GFX1150-NEXT:  .LBB7_8: ; %Flow17
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    global_store_b64 v0, v[2:3], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -1623,82 +5645,229 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; SI-LABEL: unsafe_frem_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s8
-; SI-NEXT:    s_mov_b32 s1, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
-; SI-NEXT:    s_mov_b32 s10, s2
-; SI-NEXT:    s_mov_b32 s11, s3
-; SI-NEXT:    s_mov_b32 s6, s2
-; SI-NEXT:    s_mov_b32 s7, s3
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; SI-NEXT:    v_readfirstlane_b32 s6, v5
-; SI-NEXT:    s_bfe_u32 s4, s6, 0xb0014
-; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
-; SI-NEXT:    s_mov_b32 s5, 0xfffff
-; SI-NEXT:    s_mov_b32 s4, s2
-; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
-; SI-NEXT:    v_not_b32_e32 v6, s4
-; SI-NEXT:    v_and_b32_e32 v6, v4, v6
-; SI-NEXT:    v_not_b32_e32 v7, s5
-; SI-NEXT:    v_and_b32_e32 v5, v5, v7
-; SI-NEXT:    s_and_b32 s4, s6, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s7, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT:    s_cmp_gt_i32 s7, 51
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_mov_b32_e32 v7, s6
-; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB8_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; SI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB8_3
+; SI-NEXT:    s_branch .LBB8_8
+; SI-NEXT:  .LBB8_2:
+; SI-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB8_3: ; %frem.compute
+; SI-NEXT:    s_brev_b32 s3, -2
+; SI-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
+; SI-NEXT:    s_mov_b32 s4, 0
+; SI-NEXT:    s_mov_b32 s5, 0x7ff00000
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; SI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v6
+; SI-NEXT:    s_cselect_b32 s1, s0, 0
+; SI-NEXT:    v_ldexp_f64 v[8:9], v[4:5], 26
+; SI-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v3
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; SI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v6, v[2:3]
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v6
+; SI-NEXT:    s_cselect_b32 s5, s0, 0
+; SI-NEXT:    s_mov_b32 s0, -1
+; SI-NEXT:    s_add_i32 s2, s5, -1
+; SI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; SI-NEXT:    s_not_b32 s4, s2
+; SI-NEXT:    s_add_i32 s4, s4, s1
+; SI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; SI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; SI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; SI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; SI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; SI-NEXT:    s_cmp_lt_i32 s4, 27
+; SI-NEXT:    s_cbranch_scc1 .LBB8_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s1, s1, s5
+; SI-NEXT:    s_add_i32 s4, s1, 26
+; SI-NEXT:    s_mov_b32 s1, 0x432fffff
+; SI-NEXT:    v_mov_b32_e32 v14, 0x43300000
+; SI-NEXT:    v_mov_b32_e32 v10, 0
+; SI-NEXT:  .LBB8_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v13, v9
+; SI-NEXT:    v_mov_b32_e32 v12, v8
+; SI-NEXT:    v_mul_f64 v[8:9], v[12:13], v[6:7]
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[8:9]|, s[0:1]
+; SI-NEXT:    v_bfi_b32 v11, s3, v14, v9
+; SI-NEXT:    v_fma_f64 v[15:16], v[12:13], v[6:7], v[10:11]
+; SI-NEXT:    v_add_f64 v[15:16], v[15:16], -v[10:11]
+; SI-NEXT:    v_cndmask_b32_e32 v9, v16, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v8, v15, v8, vcc
+; SI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[4:5], v[12:13]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[8:9]
+; SI-NEXT:    v_add_f64 v[15:16], v[8:9], v[4:5]
+; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v16, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v8, v8, v15, vcc
+; SI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 26
+; SI-NEXT:    s_sub_i32 s4, s4, 26
+; SI-NEXT:    s_cmp_gt_i32 s4, 26
+; SI-NEXT:    s_cbranch_scc1 .LBB8_5
+; SI-NEXT:  ; %bb.6: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v8, v12
+; SI-NEXT:    v_mov_b32_e32 v9, v13
+; SI-NEXT:  .LBB8_7: ; %frem.loop_exit
+; SI-NEXT:    s_sub_i32 s0, s4, 25
+; SI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], s0
+; SI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
+; SI-NEXT:    s_mov_b32 s0, -1
+; SI-NEXT:    s_mov_b32 s1, 0x432fffff
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[10:11]|, s[0:1]
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_mov_b32_e32 v12, 0x43300000
+; SI-NEXT:    v_bfi_b32 v13, s0, v12, v11
+; SI-NEXT:    v_mov_b32_e32 v12, 0
+; SI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[12:13]
+; SI-NEXT:    v_add_f64 v[6:7], v[6:7], -v[12:13]
+; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; SI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[8:9]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; SI-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; SI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], s2
+; SI-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; SI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; SI-NEXT:  .LBB8_8: ; %Flow17
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[2:3]
+; SI-NEXT:    v_mov_b32_e32 v6, 0x3fc
+; SI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[2:3], v6
+; SI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; SI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v2
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v5, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: unsafe_frem_f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s11, 0xf000
-; CI-NEXT:    s_mov_b32 s10, -1
-; CI-NEXT:    s_mov_b32 s6, s10
+; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s8, s0
-; CI-NEXT:    s_mov_b32 s9, s1
-; CI-NEXT:    s_mov_b32 s0, s2
-; CI-NEXT:    s_mov_b32 s1, s3
-; CI-NEXT:    s_mov_b32 s2, s10
-; CI-NEXT:    s_mov_b32 s3, s11
-; CI-NEXT:    s_mov_b32 s7, s11
-; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; CI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB8_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; CI-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; CI-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; CI-NEXT:    s_cbranch_execz .LBB8_3
+; CI-NEXT:    s_branch .LBB8_8
+; CI-NEXT:  .LBB8_2:
+; CI-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CI-NEXT:  .LBB8_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v11, v[2:3]
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v10, v[0:1]
+; CI-NEXT:    v_ldexp_f64 v[8:9], v[4:5], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; CI-NEXT:    v_add_i32_e32 v12, vcc, -1, v11
+; CI-NEXT:    v_not_b32_e32 v6, v12
+; CI-NEXT:    v_add_i32_e32 v13, vcc, v6, v10
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v13
+; CI-NEXT:    s_and_b64 vcc, exec, vcc
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; CI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; CI-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[6:7]
+; CI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; CI-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[6:7]
+; CI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; CI-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[6:7]
+; CI-NEXT:    s_cbranch_vccnz .LBB8_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v10, vcc, v10, v11
+; CI-NEXT:    v_add_i32_e32 v13, vcc, 26, v10
+; CI-NEXT:  .LBB8_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v11, v9
+; CI-NEXT:    v_mov_b32_e32 v10, v8
+; CI-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
+; CI-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[4:5], v[10:11]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[8:9]
+; CI-NEXT:    v_add_f64 v[14:15], v[8:9], v[4:5]
+; CI-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v8, v8, v14, vcc
+; CI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 26
+; CI-NEXT:    v_subrev_i32_e32 v13, vcc, 26, v13
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v13
+; CI-NEXT:    s_cbranch_vccnz .LBB8_5
+; CI-NEXT:  ; %bb.6: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v8, v10
+; CI-NEXT:    v_mov_b32_e32 v9, v11
+; CI-NEXT:  .LBB8_7: ; %frem.loop_exit
+; CI-NEXT:    v_subrev_i32_e32 v10, vcc, 25, v13
+; CI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v10
+; CI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[6:7]
+; CI-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[8:9]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; CI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; CI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
+; CI-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; CI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; CI-NEXT:  .LBB8_8: ; %Flow17
+; CI-NEXT:    v_mov_b32_e32 v6, 0x3fc
+; CI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[2:3], v6
+; CI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v2
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v1, v0, v5, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -1707,48 +5876,183 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
+; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT:    s_cbranch_vccz .LBB8_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; VI-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; VI-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; VI-NEXT:    s_cbranch_execz .LBB8_3
+; VI-NEXT:    s_branch .LBB8_8
+; VI-NEXT:  .LBB8_2:
+; VI-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; VI-NEXT:  .LBB8_3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v11, v[2:3]
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v10, v[0:1]
+; VI-NEXT:    v_ldexp_f64 v[8:9], v[4:5], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; VI-NEXT:    v_add_u32_e32 v12, vcc, -1, v11
+; VI-NEXT:    v_not_b32_e32 v6, v12
+; VI-NEXT:    v_add_u32_e32 v13, vcc, v6, v10
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v13
+; VI-NEXT:    s_and_b64 vcc, exec, vcc
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
 ; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
-; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
-; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
-; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
-; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
-; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
-; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; VI-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[6:7]
+; VI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; VI-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[6:7]
+; VI-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; VI-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[6:7]
+; VI-NEXT:    s_cbranch_vccnz .LBB8_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v10, vcc, v10, v11
+; VI-NEXT:    v_add_u32_e32 v13, vcc, 26, v10
+; VI-NEXT:  .LBB8_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v11, v9
+; VI-NEXT:    v_mov_b32_e32 v10, v8
+; VI-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
+; VI-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; VI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[4:5], v[10:11]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[8:9]
+; VI-NEXT:    v_add_f64 v[14:15], v[8:9], v[4:5]
+; VI-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v8, v8, v14, vcc
+; VI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 26
+; VI-NEXT:    v_subrev_u32_e32 v13, vcc, 26, v13
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v13
+; VI-NEXT:    s_cbranch_vccnz .LBB8_5
+; VI-NEXT:  ; %bb.6: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v8, v10
+; VI-NEXT:    v_mov_b32_e32 v9, v11
+; VI-NEXT:  .LBB8_7: ; %frem.loop_exit
+; VI-NEXT:    v_subrev_u32_e32 v10, vcc, 25, v13
+; VI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v10
+; VI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[6:7]
+; VI-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[8:9]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; VI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
+; VI-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; VI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; VI-NEXT:  .LBB8_8: ; %Flow17
+; VI-NEXT:    v_mov_b32_e32 v8, 0x3fc
+; VI-NEXT:    v_mov_b32_e32 v6, s0
+; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[2:3], v8
+; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v2
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v5, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; VI-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: unsafe_frem_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[10:11]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB8_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB8_3
+; GFX9-NEXT:    s_branch .LBB8_8
+; GFX9-NEXT:  .LBB8_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT:  .LBB8_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v11, v[2:3]
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v10, v[0:1]
+; GFX9-NEXT:    v_ldexp_f64 v[8:9], v[4:5], 26
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; GFX9-NEXT:    v_add_u32_e32 v12, -1, v11
+; GFX9-NEXT:    v_not_b32_e32 v6, v12
+; GFX9-NEXT:    v_add_u32_e32 v13, v6, v10
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v13
+; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; GFX9-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX9-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; GFX9-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[6:7]
+; GFX9-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; GFX9-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[6:7]
+; GFX9-NEXT:    v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0
+; GFX9-NEXT:    v_fma_f64 v[6:7], v[14:15], v[6:7], v[6:7]
+; GFX9-NEXT:    s_cbranch_vccnz .LBB8_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v10, v10, v11
+; GFX9-NEXT:    v_add_u32_e32 v13, 26, v10
+; GFX9-NEXT:  .LBB8_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
+; GFX9-NEXT:    v_subrev_u32_e32 v13, 26, v13
+; GFX9-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[4:5], v[10:11]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[8:9]
+; GFX9-NEXT:    v_add_f64 v[14:15], v[8:9], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v14, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 26
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v13
+; GFX9-NEXT:    s_cbranch_vccnz .LBB8_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v8, v10
+; GFX9-NEXT:    v_mov_b32_e32 v9, v11
+; GFX9-NEXT:  .LBB8_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_subrev_u32_e32 v10, 25, v13
+; GFX9-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v10
+; GFX9-NEXT:    v_mul_f64 v[6:7], v[8:9], v[6:7]
+; GFX9-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; GFX9-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[8:9]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v12
+; GFX9-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; GFX9-NEXT:    v_xor_b32_e32 v5, v6, v5
+; GFX9-NEXT:  .LBB8_8: ; %Flow17
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x3fc
+; GFX9-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[0:1], v[2:3], v7
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7ff80000
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: unsafe_frem_f64:
@@ -1756,23 +6060,91 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[2:3]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v12, 0x80000000, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[0:1]
+; GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB8_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]|
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v1, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB8_3
+; GFX10-NEXT:    s_branch .LBB8_8
+; GFX10-NEXT:  .LBB8_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT:  .LBB8_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v7, v[2:3]
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX10-NEXT:    v_ldexp_f64 v[8:9], v[4:5], 26
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; GFX10-NEXT:    v_add_nc_u32_e32 v13, -1, v7
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v7
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX10-NEXT:    v_not_b32_e32 v7, v13
+; GFX10-NEXT:    v_add_nc_u32_e32 v14, v7, v6
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v14
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; GFX10-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; GFX10-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; GFX10-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX10-NEXT:    s_cbranch_vccnz .LBB8_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 26
+; GFX10-NEXT:  .LBB8_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v9
+; GFX10-NEXT:    v_mov_b32_e32 v10, v8
+; GFX10-NEXT:    s_sub_i32 s2, s2, 26
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX10-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
+; GFX10-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[4:5], v[10:11]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[8:9]
+; GFX10-NEXT:    v_add_f64 v[14:15], v[8:9], v[4:5]
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v14, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 26
+; GFX10-NEXT:    s_cbranch_scc1 .LBB8_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v8, v10
+; GFX10-NEXT:    v_mov_b32_e32 v14, s2
+; GFX10-NEXT:    v_mov_b32_e32 v9, v11
+; GFX10-NEXT:  .LBB8_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 25, v14
+; GFX10-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v10
+; GFX10-NEXT:    v_mul_f64 v[6:7], v[8:9], v[6:7]
+; GFX10-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[8:9]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
+; GFX10-NEXT:    v_xor_b32_e32 v5, v12, v5
+; GFX10-NEXT:  .LBB8_8: ; %Flow17
+; GFX10-NEXT:    v_cmp_class_f64_e64 s2, v[2:3], 0x3fc
+; GFX10-NEXT:    v_cmp_class_f64_e64 s3, v[0:1], 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: unsafe_frem_f64:
@@ -1780,28 +6152,107 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v10, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
-; GFX11-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
+; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v12, 0x80000000, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX11-NEXT:    v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB8_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]|
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v1, v12, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB8_3
+; GFX11-NEXT:    s_branch .LBB8_8
+; GFX11-NEXT:  .LBB8_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT:  .LBB8_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v7, v[2:3]
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f64 v[8:9], v[4:5], 26
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, -1, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v7, v13
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, v7, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v14
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; GFX11-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX11-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX11-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX11-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
-; GFX11-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
+; GFX11-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX11-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX11-NEXT:    s_cbranch_vccnz .LBB8_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 26
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB8_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX11-NEXT:    s_sub_i32 s2, s2, 26
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
+; GFX11-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[4:5], v[10:11]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[8:9]
+; GFX11-NEXT:    v_add_f64 v[14:15], v[8:9], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_cndmask_b32 v8, v8, v14
+; GFX11-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 26
+; GFX11-NEXT:    s_cbranch_scc1 .LBB8_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v8, v10
+; GFX11-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v9, v11
+; GFX11-NEXT:  .LBB8_7: ; %frem.loop_exit
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
-; GFX11-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 25, v14
+; GFX11-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
-; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NEXT:    v_mul_f64 v[6:7], v[8:9], v[6:7]
+; GFX11-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[8:9]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v5, v7, v5 :: v_dual_cndmask_b32 v4, v6, v4
+; GFX11-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v5, v12, v5
+; GFX11-NEXT:  .LBB8_8: ; %Flow17
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[2:3], 0x3fc
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[0:1], 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_cndmask_b32 v1, 0x7ff80000, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f64:
@@ -1809,28 +6260,106 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
-; GFX1150-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
+; GFX1150-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_and_b32_e32 v12, 0x80000000, v1
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1150-NEXT:    v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[2:3]|
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX1150-NEXT:    s_cbranch_vccz .LBB8_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[2:3]|
+; GFX1150-NEXT:    v_cndmask_b32_e32 v5, v1, v12, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB8_3
+; GFX1150-NEXT:    s_branch .LBB8_8
+; GFX1150-NEXT:  .LBB8_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX1150-NEXT:  .LBB8_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[0:1]|
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v7, v[2:3]
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v6, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_ldexp_f64 v[8:9], v[4:5], 26
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[4:5], |v[2:3]|
+; GFX1150-NEXT:    v_add_nc_u32_e32 v13, -1, v7
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v7
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v7, v13
+; GFX1150-NEXT:    v_add_nc_u32_e32 v14, v7, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v14
+; GFX1150-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
+; GFX1150-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 1
+; GFX1150-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7]
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB8_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s2, s3
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 26
+; GFX1150-NEXT:    .p2align 6
+; GFX1150-NEXT:  .LBB8_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
+; GFX1150-NEXT:    s_sub_i32 s2, s2, 26
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[8:9], v[10:11], v[6:7]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[8:9], v[8:9]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[4:5], v[10:11]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[8:9]
+; GFX1150-NEXT:    v_add_f64 v[14:15], v[8:9], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_dual_cndmask_b32 v9, v9, v15 :: v_dual_cndmask_b32 v8, v8, v14
+; GFX1150-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 26
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB8_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v8, v10
+; GFX1150-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v9, v11
+; GFX1150-NEXT:  .LBB8_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_subrev_nc_u32_e32 v10, 25, v14
+; GFX1150-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v10
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    v_mul_f64 v[6:7], v[8:9], v[6:7]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[6:7], v[6:7]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
-; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[4:5], v[8:9]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1150-NEXT:    v_add_f64 v[4:5], v[6:7], v[4:5]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1150-NEXT:    v_dual_cndmask_b32 v5, v7, v5 :: v_dual_cndmask_b32 v4, v6, v4
+; GFX1150-NEXT:    v_ldexp_f64 v[4:5], v[4:5], v13
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX1150-NEXT:    v_xor_b32_e32 v5, v12, v5
+; GFX1150-NEXT:  .LBB8_8: ; %Flow17
+; GFX1150-NEXT:    v_cmp_class_f64_e64 s2, v[2:3], 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f64_e64 s3, v[0:1], 0x1f8
+; GFX1150-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[2:3]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s3
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX1150-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_cndmask_b32 v1, 0x7ff80000, v5
+; GFX1150-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX1150-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -1844,223 +6373,796 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-LABEL: frem_v2f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s8
-; SI-NEXT:    s_mov_b32 s1, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
-; SI-NEXT:    s_mov_b32 s10, s2
-; SI-NEXT:    s_mov_b32 s11, s3
-; SI-NEXT:    s_mov_b32 s6, s2
-; SI-NEXT:    s_mov_b32 s7, s3
-; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
-; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
-; SI-NEXT:    v_rcp_f32_e32 v6, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v2
+; SI-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v3
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v2|, |v3|
+; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v2
+; SI-NEXT:    s_cbranch_vccz .LBB9_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    v_bfi_b32 v7, s0, 0, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v5, v6
+; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB9_3
+; SI-NEXT:    s_branch .LBB9_8
+; SI-NEXT:  .LBB9_2:
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB9_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v5|, s3
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v4, v5
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v4
+; SI-NEXT:    s_cselect_b32 s2, s0, 0
+; SI-NEXT:    v_frexp_mant_f32_e32 v4, v5
+; SI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v5, v4, 11
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v6|, s3
+; SI-NEXT:    v_frexp_mant_f32_e32 v4, v6
+; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v6
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v6
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v4, v4, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v4, 1.0
+; SI-NEXT:    v_div_scale_f32 v7, s[4:5], v4, v4, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v8, v7
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
-; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
-; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
-; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
-; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
+; SI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; SI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; SI-NEXT:    v_mul_f32_e32 v9, v6, v8
+; SI-NEXT:    v_fma_f32 v10, -v7, v9, v6
+; SI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; SI-NEXT:    v_fma_f32 v6, -v7, v9, v6
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
-; SI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
-; SI-NEXT:    v_trunc_f32_e32 v4, v4
-; SI-NEXT:    v_fma_f32 v0, -v4, v2, v0
-; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
-; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
-; SI-NEXT:    v_rcp_f32_e32 v5, v4
+; SI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
+; SI-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB9_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 11
+; SI-NEXT:  .LBB9_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NEXT:    v_mul_f32_e32 v5, v7, v6
+; SI-NEXT:    v_rndne_f32_e32 v5, v5
+; SI-NEXT:    v_fma_f32 v5, -v5, v4, v7
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; SI-NEXT:    v_add_f32_e32 v8, v5, v4
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v5, v5, 11
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    s_cmp_gt_i32 s1, 11
+; SI-NEXT:    s_cbranch_scc1 .LBB9_5
+; SI-NEXT:  ; %bb.6: ; %Flow58
+; SI-NEXT:    v_mov_b32_e32 v5, v7
+; SI-NEXT:  .LBB9_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s1, s1, -10
+; SI-NEXT:    v_ldexp_f32_e64 v5, v5, s1
+; SI-NEXT:    v_mul_f32_e32 v6, v5, v6
+; SI-NEXT:    v_rndne_f32_e32 v6, v6
+; SI-NEXT:    v_fma_f32 v5, -v6, v4, v5
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; SI-NEXT:    v_add_f32_e32 v4, v5, v4
+; SI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v4, v4, s0
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_and_b32_e32 v5, 0x8000, v5
+; SI-NEXT:    v_xor_b32_e32 v4, v5, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:  .LBB9_8:
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v1
+; SI-NEXT:    v_cvt_f32_f16_e64 v6, |v5|
+; SI-NEXT:    v_cvt_f32_f16_e64 v7, |v7|
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v6, v7
+; SI-NEXT:    s_cbranch_vccz .LBB9_10
+; SI-NEXT:  ; %bb.9: ; %frem.else20
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v8, s0, 0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v7
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB9_11
+; SI-NEXT:    s_branch .LBB9_16
+; SI-NEXT:  .LBB9_10:
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB9_11: ; %frem.compute19
+; SI-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v6|, s3
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v6
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v5
+; SI-NEXT:    s_cselect_b32 s2, s0, 0
+; SI-NEXT:    v_frexp_mant_f32_e32 v5, v6
+; SI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v6, v5, 11
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v7|, s3
+; SI-NEXT:    v_frexp_mant_f32_e32 v5, v7
+; SI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v7
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v7
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v5, v5, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v5, 1.0
+; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v5, v5, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v9, v8
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
-; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
-; SI-NEXT:    v_mul_f32_e32 v6, v2, v5
-; SI-NEXT:    v_fma_f32 v7, -v4, v6, v2
-; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
-; SI-NEXT:    v_fma_f32 v2, -v4, v6, v2
-; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
-; SI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
-; SI-NEXT:    v_trunc_f32_e32 v2, v2
-; SI-NEXT:    v_fma_f32 v1, -v2, v3, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_endpgm
+; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; SI-NEXT:    v_mul_f32_e32 v10, v7, v9
+; SI-NEXT:    v_fma_f32 v11, -v8, v10, v7
+; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; SI-NEXT:    v_fma_f32 v7, -v8, v10, v7
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
+; SI-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB9_15
+; SI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 11
+; SI-NEXT:  .LBB9_13: ; %frem.loop_body27
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v8, v6
+; SI-NEXT:    v_mul_f32_e32 v6, v8, v7
+; SI-NEXT:    v_rndne_f32_e32 v6, v6
+; SI-NEXT:    v_fma_f32 v6, -v6, v5, v8
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; SI-NEXT:    v_add_f32_e32 v9, v6, v5
+; SI-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v6, v6, 11
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    s_cmp_gt_i32 s1, 11
+; SI-NEXT:    s_cbranch_scc1 .LBB9_13
+; SI-NEXT:  ; %bb.14: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:  .LBB9_15: ; %frem.loop_exit28
+; SI-NEXT:    s_add_i32 s1, s1, -10
+; SI-NEXT:    v_ldexp_f32_e64 v6, v6, s1
+; SI-NEXT:    v_mul_f32_e32 v7, v6, v7
+; SI-NEXT:    v_rndne_f32_e32 v7, v7
+; SI-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; SI-NEXT:    v_add_f32_e32 v5, v6, v5
+; SI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v5, v5, s0
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_and_b32_e32 v6, 0x8000, v6
+; SI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:  .LBB9_16: ; %Flow57
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; SI-NEXT:    s_movk_i32 s4, 0x7c01
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; SI-NEXT:    s_movk_i32 s5, 0x7c00
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v2
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v4, vcc
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v3, v0
+; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: frem_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s6, s2
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s0, s8
-; CI-NEXT:    s_mov_b32 s1, s9
-; CI-NEXT:    s_mov_b32 s8, s10
-; CI-NEXT:    s_mov_b32 s9, s11
-; CI-NEXT:    s_mov_b32 s10, s2
-; CI-NEXT:    s_mov_b32 s11, s3
-; CI-NEXT:    s_mov_b32 s7, s3
-; CI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; CI-NEXT:    buffer_load_dword v2, off, s[4:7], 0 offset:16
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:16
+; CI-NEXT:    s_brev_b32 s0, -2
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
-; CI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
-; CI-NEXT:    v_rcp_f32_e32 v6, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v2
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v2|, |v3|
+; CI-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v2
+; CI-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v3
+; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT:    s_cbranch_vccz .LBB9_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT:    v_bfi_b32 v7, s0, 0, v2
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v5
+; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; CI-NEXT:    s_cbranch_execz .LBB9_3
+; CI-NEXT:    s_branch .LBB9_8
+; CI-NEXT:  .LBB9_2:
+; CI-NEXT:    ; implicit-def: $vgpr4
+; CI-NEXT:  .LBB9_3: ; %frem.compute
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v9, v6
+; CI-NEXT:    v_frexp_mant_f32_e32 v4, v6
+; CI-NEXT:    v_frexp_mant_f32_e32 v6, v5
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v5
+; CI-NEXT:    v_ldexp_f32_e64 v5, v6, 1
+; CI-NEXT:    v_div_scale_f32 v11, s[0:1], v5, v5, 1.0
+; CI-NEXT:    v_ldexp_f32_e64 v7, v4, 11
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -1, v10
+; CI-NEXT:    v_not_b32_e32 v6, v4
+; CI-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v5, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v12, v11
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
-; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
-; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
-; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
-; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
+; CI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; CI-NEXT:    v_fma_f32 v12, v13, v12, v12
+; CI-NEXT:    v_mul_f32_e32 v13, v8, v12
+; CI-NEXT:    v_fma_f32 v14, -v11, v13, v8
+; CI-NEXT:    v_fma_f32 v13, v14, v12, v13
+; CI-NEXT:    v_fma_f32 v8, -v11, v13, v8
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
-; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
-; CI-NEXT:    v_trunc_f32_e32 v4, v4
-; CI-NEXT:    v_fma_f32 v0, -v4, v2, v0
-; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
-; CI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
-; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; CI-NEXT:    v_rcp_f32_e32 v5, v4
+; CI-NEXT:    v_div_fmas_f32 v8, v8, v12, v13
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v6
+; CI-NEXT:    v_div_fixup_f32 v8, v8, v5, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB9_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v6, vcc, v9, v10
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 11, v6
+; CI-NEXT:  .LBB9_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v9, v7
+; CI-NEXT:    v_mul_f32_e32 v7, v9, v8
+; CI-NEXT:    v_rndne_f32_e32 v7, v7
+; CI-NEXT:    v_fma_f32 v7, -v7, v5, v9
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT:    v_add_f32_e32 v10, v7, v5
+; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; CI-NEXT:    v_add_i32_e32 v6, vcc, -11, v6
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v6
+; CI-NEXT:    v_ldexp_f32_e64 v7, v7, 11
+; CI-NEXT:    s_cbranch_vccnz .LBB9_5
+; CI-NEXT:  ; %bb.6: ; %Flow58
+; CI-NEXT:    v_mov_b32_e32 v7, v9
+; CI-NEXT:  .LBB9_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v6, vcc, -10, v6
+; CI-NEXT:    v_ldexp_f32_e32 v6, v7, v6
+; CI-NEXT:    v_mul_f32_e32 v7, v6, v8
+; CI-NEXT:    v_rndne_f32_e32 v7, v7
+; CI-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT:    v_add_f32_e32 v5, v6, v5
+; CI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v4, v5, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_and_b32_e32 v5, 0x8000, v5
+; CI-NEXT:    v_xor_b32_e32 v4, v5, v4
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT:  .LBB9_8:
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v1
+; CI-NEXT:    v_cvt_f32_f16_e64 v7, |v5|
+; CI-NEXT:    v_cvt_f32_f16_e64 v6, |v6|
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v7, v6
+; CI-NEXT:    s_cbranch_vccz .LBB9_10
+; CI-NEXT:  ; %bb.9: ; %frem.else20
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v8, s0, 0, v0
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v7, v6
+; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; CI-NEXT:    s_cbranch_execz .LBB9_11
+; CI-NEXT:    s_branch .LBB9_16
+; CI-NEXT:  .LBB9_10:
+; CI-NEXT:    ; implicit-def: $vgpr5
+; CI-NEXT:  .LBB9_11: ; %frem.compute19
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v7
+; CI-NEXT:    v_frexp_mant_f32_e32 v5, v7
+; CI-NEXT:    v_frexp_mant_f32_e32 v7, v6
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v11, v6
+; CI-NEXT:    v_ldexp_f32_e64 v6, v7, 1
+; CI-NEXT:    v_div_scale_f32 v12, s[0:1], v6, v6, 1.0
+; CI-NEXT:    v_ldexp_f32_e64 v8, v5, 11
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -1, v11
+; CI-NEXT:    v_not_b32_e32 v7, v5
+; CI-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CI-NEXT:    v_div_scale_f32 v9, vcc, 1.0, v6, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v13, v12
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
-; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
-; CI-NEXT:    v_mul_f32_e32 v6, v2, v5
-; CI-NEXT:    v_fma_f32 v7, -v4, v6, v2
-; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
-; CI-NEXT:    v_fma_f32 v2, -v4, v6, v2
+; CI-NEXT:    v_fma_f32 v14, -v12, v13, 1.0
+; CI-NEXT:    v_fma_f32 v13, v14, v13, v13
+; CI-NEXT:    v_mul_f32_e32 v14, v9, v13
+; CI-NEXT:    v_fma_f32 v15, -v12, v14, v9
+; CI-NEXT:    v_fma_f32 v14, v15, v13, v14
+; CI-NEXT:    v_fma_f32 v9, -v12, v14, v9
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
-; CI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
-; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v1, -v2, v3, v1
+; CI-NEXT:    v_div_fmas_f32 v9, v9, v13, v14
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v7
+; CI-NEXT:    v_div_fixup_f32 v9, v9, v6, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB9_15
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT:    v_sub_i32_e32 v7, vcc, v10, v11
+; CI-NEXT:    v_add_i32_e32 v7, vcc, 11, v7
+; CI-NEXT:  .LBB9_13: ; %frem.loop_body27
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v10, v8
+; CI-NEXT:    v_mul_f32_e32 v8, v10, v9
+; CI-NEXT:    v_rndne_f32_e32 v8, v8
+; CI-NEXT:    v_fma_f32 v8, -v8, v6, v10
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v8
+; CI-NEXT:    v_add_f32_e32 v11, v8, v6
+; CI-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; CI-NEXT:    v_add_i32_e32 v7, vcc, -11, v7
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v7
+; CI-NEXT:    v_ldexp_f32_e64 v8, v8, 11
+; CI-NEXT:    s_cbranch_vccnz .LBB9_13
+; CI-NEXT:  ; %bb.14: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v8, v10
+; CI-NEXT:  .LBB9_15: ; %frem.loop_exit28
+; CI-NEXT:    v_add_i32_e32 v7, vcc, -10, v7
+; CI-NEXT:    v_ldexp_f32_e32 v7, v8, v7
+; CI-NEXT:    v_mul_f32_e32 v8, v7, v9
+; CI-NEXT:    v_rndne_f32_e32 v8, v8
+; CI-NEXT:    v_fma_f32 v7, -v8, v6, v7
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT:    v_add_f32_e32 v6, v7, v6
+; CI-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v5, v6, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_and_b32_e32 v6, 0x8000, v6
+; CI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:  .LBB9_16: ; %Flow57
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    s_movk_i32 s4, 0x7c01
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; CI-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; CI-NEXT:    s_movk_i32 s5, 0x7c00
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_or_b32_e32 v0, v1, v0
-; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v3
+; CI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v2
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v6
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; CI-NEXT:    v_cndmask_b32_e32 v3, v2, v4, vcc
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v1
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v5
+; CI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; CI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v0
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT:    v_or_b32_e32 v0, v3, v0
+; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_u32 s0, s4, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    flat_load_dword v4, v[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_load_dword v2, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_add_u32 s0, s0, 16
+; VI-NEXT:    v_mov_b32_e32 v1, s11
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    flat_load_dword v1, v[1:2]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
-; VI-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; VI-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT:    v_rcp_f32_e32 v8, v7
-; VI-NEXT:    v_mul_f32_e32 v9, v5, v8
-; VI-NEXT:    v_mad_f32 v10, -v7, v9, v5
-; VI-NEXT:    v_mac_f32_e32 v9, v10, v8
-; VI-NEXT:    v_mad_f32 v5, -v7, v9, v5
-; VI-NEXT:    v_mul_f32_e32 v5, v5, v8
-; VI-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
-; VI-NEXT:    v_add_f32_e32 v5, v5, v9
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |v1|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
+; VI-NEXT:    s_cbranch_vccz .LBB9_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_movk_i32 s0, 0x7fff
+; VI-NEXT:    v_bfi_b32 v2, s0, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; VI-NEXT:    s_cbranch_execz .LBB9_3
+; VI-NEXT:    s_branch .LBB9_8
+; VI-NEXT:  .LBB9_2:
+; VI-NEXT:    ; implicit-def: $vgpr2
+; VI-NEXT:  .LBB9_3: ; %frem.compute
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; VI-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; VI-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
+; VI-NEXT:    v_ldexp_f32 v3, v4, 1
+; VI-NEXT:    v_div_scale_f32 v9, s[0:1], v3, v3, 1.0
+; VI-NEXT:    v_ldexp_f32 v5, v2, 11
+; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT:    v_not_b32_e32 v4, v2
+; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v7
+; VI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v10, v9
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
+; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
+; VI-NEXT:    v_fma_f32 v12, -v9, v11, v6
+; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
+; VI-NEXT:    v_fma_f32 v6, -v9, v11, v6
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v4
+; VI-NEXT:    v_div_fixup_f32 v6, v6, v3, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB9_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, v7, v8
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 11, v4
+; VI-NEXT:  .LBB9_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NEXT:    v_mul_f32_e32 v5, v7, v6
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v5, -v5, v3, v7
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT:    v_add_f32_e32 v8, v5, v3
+; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -11, v4
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v4
+; VI-NEXT:    v_ldexp_f32 v5, v5, 11
+; VI-NEXT:    s_cbranch_vccnz .LBB9_5
+; VI-NEXT:  ; %bb.6: ; %Flow58
+; VI-NEXT:    v_mov_b32_e32 v5, v7
+; VI-NEXT:  .LBB9_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -10, v4
+; VI-NEXT:    v_ldexp_f32 v4, v5, v4
+; VI-NEXT:    v_mul_f32_e32 v5, v4, v6
+; VI-NEXT:    v_rndne_f32_e32 v5, v5
+; VI-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT:    v_add_f32_e32 v3, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT:    v_ldexp_f32 v2, v3, v2
+; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; VI-NEXT:    v_xor_b32_e32 v2, v3, v2
+; VI-NEXT:  .LBB9_8:
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; VI-NEXT:    v_cvt_f32_f16_e64 v7, |v3|
+; VI-NEXT:    v_cvt_f32_f16_e64 v6, |v4|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v7, v6
+; VI-NEXT:    s_cbranch_vccz .LBB9_10
+; VI-NEXT:  ; %bb.9: ; %frem.else20
+; VI-NEXT:    s_movk_i32 s0, 0x7fff
+; VI-NEXT:    v_bfi_b32 v5, s0, 0, v3
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v7, v6
+; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v5, vcc
+; VI-NEXT:    s_cbranch_execz .LBB9_11
+; VI-NEXT:    s_branch .LBB9_16
+; VI-NEXT:  .LBB9_10:
+; VI-NEXT:    ; implicit-def: $vgpr5
+; VI-NEXT:  .LBB9_11: ; %frem.compute19
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v7
+; VI-NEXT:    v_frexp_mant_f32_e32 v5, v7
+; VI-NEXT:    v_frexp_mant_f32_e32 v7, v6
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v11, v6
+; VI-NEXT:    v_ldexp_f32 v6, v7, 1
+; VI-NEXT:    v_div_scale_f32 v12, s[0:1], v6, v6, 1.0
+; VI-NEXT:    v_ldexp_f32 v8, v5, 11
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -1, v11
+; VI-NEXT:    v_not_b32_e32 v7, v5
+; VI-NEXT:    v_add_u32_e32 v7, vcc, v7, v10
+; VI-NEXT:    v_div_scale_f32 v9, vcc, 1.0, v6, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v13, v12
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v14, -v12, v13, 1.0
+; VI-NEXT:    v_fma_f32 v13, v14, v13, v13
+; VI-NEXT:    v_mul_f32_e32 v14, v9, v13
+; VI-NEXT:    v_fma_f32 v15, -v12, v14, v9
+; VI-NEXT:    v_fma_f32 v14, v15, v13, v14
+; VI-NEXT:    v_fma_f32 v9, -v12, v14, v9
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v9, v9, v13, v14
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v7
+; VI-NEXT:    v_div_fixup_f32 v9, v9, v6, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB9_15
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT:    v_sub_u32_e32 v7, vcc, v10, v11
+; VI-NEXT:    v_add_u32_e32 v7, vcc, 11, v7
+; VI-NEXT:  .LBB9_13: ; %frem.loop_body27
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v10, v8
+; VI-NEXT:    v_mul_f32_e32 v8, v10, v9
+; VI-NEXT:    v_rndne_f32_e32 v8, v8
+; VI-NEXT:    v_fma_f32 v8, -v8, v6, v10
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v8
+; VI-NEXT:    v_add_f32_e32 v11, v8, v6
+; VI-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; VI-NEXT:    v_add_u32_e32 v7, vcc, -11, v7
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v7
+; VI-NEXT:    v_ldexp_f32 v8, v8, 11
+; VI-NEXT:    s_cbranch_vccnz .LBB9_13
+; VI-NEXT:  ; %bb.14: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v8, v10
+; VI-NEXT:  .LBB9_15: ; %frem.loop_exit28
+; VI-NEXT:    v_add_u32_e32 v7, vcc, -10, v7
+; VI-NEXT:    v_ldexp_f32 v7, v8, v7
+; VI-NEXT:    v_mul_f32_e32 v8, v7, v9
+; VI-NEXT:    v_rndne_f32_e32 v8, v8
+; VI-NEXT:    v_fma_f32 v7, -v8, v6, v7
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT:    v_add_f32_e32 v6, v7, v6
+; VI-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; VI-NEXT:    v_ldexp_f32 v5, v6, v5
 ; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; VI-NEXT:    v_div_fixup_f16 v5, v5, v6, v3
-; VI-NEXT:    v_trunc_f16_e32 v5, v5
-; VI-NEXT:    v_fma_f16 v3, -v5, v6, v3
-; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; VI-NEXT:    v_cvt_f32_f16_e32 v5, v4
+; VI-NEXT:    v_and_b32_e32 v6, 0x8000, v3
+; VI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; VI-NEXT:  .LBB9_16: ; %Flow57
+; VI-NEXT:    v_mov_b32_e32 v6, 0x3fc
+; VI-NEXT:    v_mov_b32_e32 v7, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], v1, v6
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], v0, v7
+; VI-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v1
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v8, 0x7e00
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], v4, v6
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], v3, v7
+; VI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; VI-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v4
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v3, v8, v5, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_rcp_f32_e32 v7, v6
-; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
-; VI-NEXT:    v_mad_f32 v9, -v6, v8, v5
-; VI-NEXT:    v_mac_f32_e32 v8, v9, v7
-; VI-NEXT:    v_mad_f32 v5, -v6, v8, v5
-; VI-NEXT:    v_mul_f32_e32 v5, v5, v7
-; VI-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
-; VI-NEXT:    v_add_f32_e32 v5, v5, v8
-; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; VI-NEXT:    v_div_fixup_f16 v5, v5, v2, v4
-; VI-NEXT:    v_trunc_f16_e32 v5, v5
-; VI-NEXT:    v_fma_f16 v2, -v5, v2, v4
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: frem_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
+; GFX9-NEXT:    global_load_dword v1, v2, s[10:11]
+; GFX9-NEXT:    global_load_dword v0, v2, s[0:1] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v4, |v1|
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mac_f32_e32 v3, v5, v4
-; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v3, |v0|
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
+; GFX9-NEXT:    s_cbranch_vccz .LBB9_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v2, s0, 0, v1
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB9_3
+; GFX9-NEXT:    s_branch .LBB9_8
+; GFX9-NEXT:  .LBB9_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr2
+; GFX9-NEXT:  .LBB9_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
+; GFX9-NEXT:    v_ldexp_f32 v3, v4, 1
+; GFX9-NEXT:    v_div_scale_f32 v9, s[0:1], v3, v3, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; GFX9-NEXT:    v_ldexp_f32 v5, v2, 11
+; GFX9-NEXT:    v_add_u32_e32 v2, -1, v8
+; GFX9-NEXT:    v_not_b32_e32 v4, v2
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX9-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX9-NEXT:    v_fma_f32 v10, v11, v10, v10
+; GFX9-NEXT:    v_mul_f32_e32 v11, v6, v10
+; GFX9-NEXT:    v_fma_f32 v12, -v9, v11, v6
+; GFX9-NEXT:    v_fma_f32 v11, v12, v10, v11
+; GFX9-NEXT:    v_fma_f32 v6, -v9, v11, v6
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v4
+; GFX9-NEXT:    v_div_fixup_f32 v6, v6, v3, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB9_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v4, v7, v8
+; GFX9-NEXT:    v_add_u32_e32 v4, 11, v4
+; GFX9-NEXT:  .LBB9_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v6
+; GFX9-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX9-NEXT:    v_fma_f32 v5, -v5, v3, v7
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_add_f32_e32 v8, v5, v3
+; GFX9-NEXT:    v_add_u32_e32 v4, -11, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v4
+; GFX9-NEXT:    v_ldexp_f32 v5, v5, 11
+; GFX9-NEXT:    s_cbranch_vccnz .LBB9_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow58
+; GFX9-NEXT:    v_mov_b32_e32 v5, v7
+; GFX9-NEXT:  .LBB9_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v4, -10, v4
+; GFX9-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v6
+; GFX9-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX9-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v4
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v7
-; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX9-NEXT:    v_mad_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX9-NEXT:    v_mac_f32_e32 v5, v8, v7
-; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v1
-; GFX9-NEXT:    v_mad_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v7
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_div_fixup_f16 v1, v1, v6, v4
-; GFX9-NEXT:    v_trunc_f16_e32 v1, v1
-; GFX9-NEXT:    v_fma_f16 v1, -v1, v6, v4
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x8000, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX9-NEXT:  .LBB9_8:
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v6, |v3|
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v5, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v6, v5
+; GFX9-NEXT:    s_cbranch_vccz .LBB9_10
+; GFX9-NEXT:  ; %bb.9: ; %frem.else20
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v4, s0, 0, v3
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB9_11
+; GFX9-NEXT:    s_branch .LBB9_16
+; GFX9-NEXT:  .LBB9_10:
+; GFX9-NEXT:    ; implicit-def: $vgpr4
+; GFX9-NEXT:  .LBB9_11: ; %frem.compute19
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v9, v6
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v4, v6
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v6, v5
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v10, v5
+; GFX9-NEXT:    v_ldexp_f32 v5, v6, 1
+; GFX9-NEXT:    v_div_scale_f32 v11, s[0:1], v5, v5, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v5, 1.0
+; GFX9-NEXT:    v_ldexp_f32 v7, v4, 11
+; GFX9-NEXT:    v_add_u32_e32 v4, -1, v10
+; GFX9-NEXT:    v_not_b32_e32 v6, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v9
+; GFX9-NEXT:    v_rcp_f32_e32 v12, v11
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; GFX9-NEXT:    v_fma_f32 v12, v13, v12, v12
+; GFX9-NEXT:    v_mul_f32_e32 v13, v8, v12
+; GFX9-NEXT:    v_fma_f32 v14, -v11, v13, v8
+; GFX9-NEXT:    v_fma_f32 v13, v14, v12, v13
+; GFX9-NEXT:    v_fma_f32 v8, -v11, v13, v8
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-NEXT:    v_div_fmas_f32 v8, v8, v12, v13
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v6
+; GFX9-NEXT:    v_div_fixup_f32 v8, v8, v5, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB9_15
+; GFX9-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v6, v9, v10
+; GFX9-NEXT:    v_add_u32_e32 v6, 11, v6
+; GFX9-NEXT:  .LBB9_13: ; %frem.loop_body27
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v8
+; GFX9-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX9-NEXT:    v_fma_f32 v7, -v7, v5, v9
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_add_f32_e32 v10, v7, v5
+; GFX9-NEXT:    v_add_u32_e32 v6, -11, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v6
+; GFX9-NEXT:    v_ldexp_f32 v7, v7, 11
+; GFX9-NEXT:    s_cbranch_vccnz .LBB9_13
+; GFX9-NEXT:  ; %bb.14: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v7, v9
+; GFX9-NEXT:  .LBB9_15: ; %frem.loop_exit28
+; GFX9-NEXT:    v_add_u32_e32 v6, -10, v6
+; GFX9-NEXT:    v_ldexp_f32 v6, v7, v6
+; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v8
+; GFX9-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX9-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0x8000, v3
+; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX9-NEXT:  .LBB9_16: ; %Flow57
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3fc
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[0:1], v0, v5
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[2:3], v1, v6
+; GFX9-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f16_sdwa s[2:3], v0, v5 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-NEXT:    v_cmp_class_f16_e32 vcc, v3, v6
+; GFX9-NEXT:    v_cmp_neq_f16_sdwa s[0:1], v0, v7 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX9-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    global_store_dword v7, v0, s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: frem_v2f16:
@@ -2068,45 +7170,175 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] offset:16
+; GFX10-NEXT:    global_load_dword v1, v2, s[2:3]
+; GFX10-NEXT:    global_load_dword v0, v2, s[6:7] offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v4, |v1|
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
-; GFX10-NEXT:    v_mad_f32 v7, -v4, v6, v3
-; GFX10-NEXT:    v_mac_f32_e32 v6, v7, v5
-; GFX10-NEXT:    v_mad_f32 v3, -v4, v6, v3
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v5
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v6
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
-; GFX10-NEXT:    v_mul_f32_e32 v7, v4, v6
-; GFX10-NEXT:    v_mad_f32 v8, -v5, v7, v4
-; GFX10-NEXT:    v_mac_f32_e32 v7, v8, v6
-; GFX10-NEXT:    v_mad_f32 v4, -v5, v7, v4
-; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v6
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX10-NEXT:    v_add_f32_e32 v4, v4, v7
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v3, |v0|
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v3
+; GFX10-NEXT:    s_cbranch_vccz .LBB9_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v2, 0x7fff, 0, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB9_3
+; GFX10-NEXT:    s_branch .LBB9_8
+; GFX10-NEXT:  .LBB9_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  .LBB9_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v6, v3
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v5, v4
+; GFX10-NEXT:    v_ldexp_f32 v4, v2, 11
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v2, v3
+; GFX10-NEXT:    v_ldexp_f32 v3, v6, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v5
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX10-NEXT:    v_div_scale_f32 v7, s4, v3, v3, 1.0
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX10-NEXT:    v_not_b32_e32 v6, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX10-NEXT:    s_denorm_mode 15
+; GFX10-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_mul_f32_e32 v9, v5, v8
+; GFX10-NEXT:    v_fma_f32 v10, -v7, v9, v5
+; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX10-NEXT:    v_fma_f32 v5, -v7, v9, v5
+; GFX10-NEXT:    s_denorm_mode 12
+; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v6
+; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB9_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 11
+; GFX10-NEXT:  .LBB9_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v4
+; GFX10-NEXT:    s_add_i32 s2, s2, -11
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX10-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX10-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX10-NEXT:    v_fma_f32 v4, -v4, v3, v7
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v4, v4, 11
+; GFX10-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow58
+; GFX10-NEXT:    v_mov_b32_e32 v6, s2
+; GFX10-NEXT:    v_mov_b32_e32 v4, v7
+; GFX10-NEXT:  .LBB9_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, -10, v6
+; GFX10-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX10-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX10-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 0x8000, v1
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX10-NEXT:  .LBB9_8:
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v5, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v6, |v3|
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
+; GFX10-NEXT:    s_cbranch_vccz .LBB9_10
+; GFX10-NEXT:  ; %bb.9: ; %frem.else20
+; GFX10-NEXT:    v_bfi_b32 v4, 0x7fff, 0, v3
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB9_11
+; GFX10-NEXT:    s_branch .LBB9_16
+; GFX10-NEXT:  .LBB9_10:
+; GFX10-NEXT:    ; implicit-def: $vgpr4
+; GFX10-NEXT:  .LBB9_11: ; %frem.compute19
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v4, v6
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v8, v5
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v7, v6
+; GFX10-NEXT:    v_ldexp_f32 v6, v4, 11
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v4, v5
+; GFX10-NEXT:    v_ldexp_f32 v5, v8, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v7
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX10-NEXT:    v_div_scale_f32 v9, s4, v5, v5, 1.0
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX10-NEXT:    v_not_b32_e32 v8, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, v8, v7
+; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0
+; GFX10-NEXT:    s_denorm_mode 15
+; GFX10-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v7
+; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX10-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX10-NEXT:    s_denorm_mode 12
+; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v8
+; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB9_15
+; GFX10-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 11
+; GFX10-NEXT:  .LBB9_13: ; %frem.loop_body27
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v6
+; GFX10-NEXT:    s_add_i32 s2, s2, -11
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX10-NEXT:    v_mul_f32_e32 v6, v9, v7
+; GFX10-NEXT:    v_rndne_f32_e32 v6, v6
+; GFX10-NEXT:    v_fma_f32 v6, -v6, v5, v9
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_add_f32_e32 v8, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v6, v6, 11
+; GFX10-NEXT:    s_cbranch_scc1 .LBB9_13
+; GFX10-NEXT:  ; %bb.14: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v8, s2
+; GFX10-NEXT:    v_mov_b32_e32 v6, v9
+; GFX10-NEXT:  .LBB9_15: ; %frem.loop_exit28
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, -10, v8
+; GFX10-NEXT:    v_ldexp_f32 v6, v6, v8
+; GFX10-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX10-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX10-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 0x8000, v3
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX10-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
-; GFX10-NEXT:    v_trunc_f16_e32 v4, v4
-; GFX10-NEXT:    v_fma_f16 v1, -v4, v2, v1
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX10-NEXT:  .LBB9_16: ; %Flow57
+; GFX10-NEXT:    v_cmp_class_f16_e64 s2, v0, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f16_e64 s3, v1, 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f16_e64 s4, v3, 0x1f8
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_cmp_class_f16_sdwa s3, v0, v5 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    v_cmp_neq_f16_sdwa s2, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_and_b32 s3, s3, s4
+; GFX10-NEXT:    s_and_b32 vcc_lo, s3, s2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: frem_v2f16:
@@ -2114,59 +7346,221 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX11-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX11-NEXT:    global_load_b32 v1, v1, s[4:5] offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v3, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    s_cbranch_vccz .LBB9_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v2, 0x7fff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB9_3
+; GFX11-NEXT:    s_branch .LBB9_8
+; GFX11-NEXT:  .LBB9_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2
+; GFX11-NEXT:  .LBB9_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v6, v3
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v5, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v4, v2, 11
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v2, v3
+; GFX11-NEXT:    v_ldexp_f32 v3, v6, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v5
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX11-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v6, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v4
-; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v8, v9, v8
+; GFX11-NEXT:    v_mul_f32_e32 v9, v5, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v10, -v7, v9, v5
+; GFX11-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v5, -v7, v9, v5
+; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB9_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 11
+; GFX11-NEXT:  .LBB9_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v7, v4
+; GFX11-NEXT:    s_add_i32 s2, s2, -11
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX11-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v4, -v4, v3, v7
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v4, v4, 11
+; GFX11-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow58
+; GFX11-NEXT:    v_mov_b32_e32 v6, s2
+; GFX11-NEXT:    v_mov_b32_e32 v4, v7
+; GFX11-NEXT:  .LBB9_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, -10, v6
+; GFX11-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX11-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
 ; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0x8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX11-NEXT:  .LBB9_8:
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v4
-; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v7
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v5, v8, v7
-; GFX11-NEXT:    v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v7, |v3|
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v6, |v4|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v7, v6
+; GFX11-NEXT:    s_cbranch_vccz .LBB9_10
+; GFX11-NEXT:  ; %bb.9: ; %frem.else20
+; GFX11-NEXT:    v_bfi_b32 v5, 0x7fff, 0, v3
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v7, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v3, v5, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB9_11
+; GFX11-NEXT:    s_branch .LBB9_16
+; GFX11-NEXT:  .LBB9_10:
+; GFX11-NEXT:    ; implicit-def: $vgpr5
+; GFX11-NEXT:  .LBB9_11: ; %frem.compute19
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v5, v7
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v9, v6
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v8, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v7, v5, 11
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v5, v6
+; GFX11-NEXT:    v_ldexp_f32 v6, v9, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v10, null, v6, v6, 1.0
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, -1, v5
+; GFX11-NEXT:    v_rcp_f32_e32 v11, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v7
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT:    v_not_b32_e32 v9, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, v9, v8
+; GFX11-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0
+; GFX11-NEXT:    s_denorm_mode 15
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v11
+; GFX11-NEXT:    v_mul_f32_e32 v12, v8, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v13, -v10, v12, v8
+; GFX11-NEXT:    v_fmac_f32_e32 v12, v13, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v8, -v10, v12, v8
+; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v8, v8, v11, v12
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v8, v8, v6, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB9_15
+; GFX11-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 11
+; GFX11-NEXT:  .LBB9_13: ; %frem.loop_body27
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v10, v7
+; GFX11-NEXT:    s_add_i32 s2, s2, -11
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v7, v10, v8
+; GFX11-NEXT:    v_rndne_f32_e32 v7, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v1, v1, v6, v4
-; GFX11-NEXT:    v_trunc_f16_e32 v1, v1
+; GFX11-NEXT:    v_fma_f32 v7, -v7, v6, v10
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v7
+; GFX11-NEXT:    v_add_f32_e32 v9, v7, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f16 v1, -v1, v6, v4
-; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v7, v7, 11
+; GFX11-NEXT:    s_cbranch_scc1 .LBB9_13
+; GFX11-NEXT:  ; %bb.14: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v9, s2
+; GFX11-NEXT:    v_mov_b32_e32 v7, v10
+; GFX11-NEXT:  .LBB9_15: ; %frem.loop_exit28
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, -10, v9
+; GFX11-NEXT:    v_ldexp_f32 v7, v7, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v8, v7, v8
+; GFX11-NEXT:    v_rndne_f32_e32 v8, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v7, -v8, v6, v7
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v7
+; GFX11-NEXT:    v_add_f32_e32 v6, v7, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v5, v6, v5
+; GFX11-NEXT:    v_and_b32_e32 v6, 0x8000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX11-NEXT:    v_xor_b32_e32 v5, v6, v5
+; GFX11-NEXT:  .LBB9_16: ; %Flow57
+; GFX11-NEXT:    v_cmp_class_f16_e64 s2, v1, 0x3fc
+; GFX11-NEXT:    v_cmp_class_f16_e64 s3, v0, 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cmp_class_f16_e64 s3, v3, 0x1f8
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f16_e64 s2, v4, 0x3fc
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f16:
@@ -2174,62 +7568,235 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX1150-NEXT:    global_load_b32 v1, v1, s[4:5] offset:16
 ; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v6
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1150-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    s_and_b32 s4, s2, 0x7fff
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX1150-NEXT:    s_cvt_f32_f16 s4, s4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3)
+; GFX1150-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, s4, v2
+; GFX1150-NEXT:    s_cbranch_vccz .LBB9_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_bfi_b32 v1, 0x7fff, 0, v0
+; GFX1150-NEXT:    v_cmp_eq_f32_e32 vcc_lo, s4, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB9_3
+; GFX1150-NEXT:    s_branch .LBB9_8
+; GFX1150-NEXT:  .LBB9_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr1
+; GFX1150-NEXT:  .LBB9_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v4, v2
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v1, v2
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v2, s4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX1150-NEXT:    v_ldexp_f32 v3, v1, 11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v1, s4
+; GFX1150-NEXT:    v_div_scale_f32 v6, null, v2, v2, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1150-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v4, v7, v6
+; GFX1150-NEXT:    v_not_b32_e32 v5, v1
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, v5, v4
+; GFX1150-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v8, v4, v7
+; GFX1150-NEXT:    v_fma_f32 v9, -v6, v8, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT:    v_add_f32_e32 v4, v6, v4
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX1150-NEXT:    v_fma_f32 v4, -v6, v8, v4
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v5
+; GFX1150-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB9_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s4, s5, s4
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s4, s4, 11
+; GFX1150-NEXT:  .LBB9_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v3
+; GFX1150-NEXT:    s_add_i32 s4, s4, -11
+; GFX1150-NEXT:    s_cmp_gt_i32 s4, 11
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v5, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_fmac_f16_e32 v3, v4, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v1
-; GFX1150-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fma_f32 v3, v3, v2, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v5, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 11
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB9_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow58
+; GFX1150-NEXT:    v_mov_b32_e32 v5, s4
+; GFX1150-NEXT:    v_mov_b32_e32 v3, v6
+; GFX1150-NEXT:  .LBB9_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, -10, v5
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fmac_f32_e32 v3, v4, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX1150-NEXT:    v_and_b32_e32 v2, 0x8000, v0
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1150-NEXT:  .LBB9_8:
+; GFX1150-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX1150-NEXT:    s_lshr_b32 s4, s2, 16
+; GFX1150-NEXT:    s_and_b32 s5, s3, 0x7fff
+; GFX1150-NEXT:    s_and_b32 s7, s4, 0x7fff
+; GFX1150-NEXT:    s_cvt_f32_f16 s6, s5
+; GFX1150-NEXT:    s_cvt_f32_f16 s5, s7
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX1150-NEXT:    s_cmp_ngt_f32 s6, s5
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB9_10
+; GFX1150-NEXT:  ; %bb.9: ; %frem.else20
+; GFX1150-NEXT:    s_cmp_eq_f32 s6, s5
+; GFX1150-NEXT:    v_bfi_b32 v2, 0x7fff, 0, s3
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB9_11
+; GFX1150-NEXT:    s_branch .LBB9_16
+; GFX1150-NEXT:  .LBB9_10:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-NEXT:  .LBB9_11: ; %frem.compute19
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v3, s5
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v2, s6
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v5, s6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX1150-NEXT:    v_ldexp_f32 v4, v2, 11
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v2, s5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s6, v5
+; GFX1150-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX1150-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v6, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v5
-; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v8
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v4, v6, v5
-; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_mul_f32_e32 v9, v5, v8
+; GFX1150-NEXT:    v_fma_f32 v10, -v7, v9, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v5, v6, v5
-; GFX1150-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX1150-NEXT:    v_fma_f32 v5, -v7, v9, v5
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v6
+; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB9_15
+; GFX1150-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-NEXT:    s_sub_i32 s5, s6, s5
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s5, s5, 11
+; GFX1150-NEXT:  .LBB9_13: ; %frem.loop_body27
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1150-NEXT:    s_add_i32 s5, s5, -11
+; GFX1150-NEXT:    s_cmp_gt_i32 s5, 11
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
-; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fma_f32 v4, v4, v3, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, 11
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB9_13
+; GFX1150-NEXT:  ; %bb.14: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v6, s5
+; GFX1150-NEXT:    v_mov_b32_e32 v4, v7
+; GFX1150-NEXT:  .LBB9_15: ; %frem.loop_exit28
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, -10, v6
+; GFX1150-NEXT:    s_and_b32 s5, s3, 0x8000
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v4, v2
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v4, v5, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v2, s5, v2
+; GFX1150-NEXT:  .LBB9_16: ; %Flow57
+; GFX1150-NEXT:    s_cmp_neq_f16 s2, 0
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s2, s2, 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s6, v0, 0x1f8
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s3, s3, 0x1f8
+; GFX1150-NEXT:    s_cselect_b32 s5, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s6
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, s5
+; GFX1150-NEXT:    s_cmp_neq_f16 s4, 0
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s4, s4, 0x3fc
+; GFX1150-NEXT:    v_dual_cndmask_b32 v0, 0x7e00, v1 :: v_dual_mov_b32 v1, 0
+; GFX1150-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    s_and_b32 s3, s4, s3
+; GFX1150-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s3, s2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1150-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX1150-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
@@ -2244,366 +7811,1505 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-LABEL: frem_v4f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s8
-; SI-NEXT:    s_mov_b32 s1, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
-; SI-NEXT:    s_mov_b32 s10, s2
-; SI-NEXT:    s_mov_b32 s11, s3
-; SI-NEXT:    s_mov_b32 s6, s2
-; SI-NEXT:    s_mov_b32 s7, s3
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; SI-NEXT:    buffer_load_dwordx2 v[7:8], off, s[0:3], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
-; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
-; SI-NEXT:    v_rcp_f32_e32 v10, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_and_b32_e32 v9, 0x7fffffff, v6
+; SI-NEXT:    v_and_b32_e32 v10, 0x7fffffff, v7
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v6|, |v7|
+; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v6
+; SI-NEXT:    s_cbranch_vccz .LBB10_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    v_bfi_b32 v11, s0, 0, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v9, v10
+; SI-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB10_3
+; SI-NEXT:    s_branch .LBB10_8
+; SI-NEXT:  .LBB10_2:
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB10_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v9|, s3
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v9
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v8
+; SI-NEXT:    s_cselect_b32 s2, s0, 0
+; SI-NEXT:    v_frexp_mant_f32_e32 v8, v9
+; SI-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v9, v8, 11
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v10|, s3
+; SI-NEXT:    v_frexp_mant_f32_e32 v8, v10
+; SI-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v10
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v10
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v8, v8, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v10, vcc, 1.0, v8, 1.0
+; SI-NEXT:    v_div_scale_f32 v11, s[4:5], v8, v8, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v12, v11
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
-; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
-; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
-; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
-; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
-; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
+; SI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; SI-NEXT:    v_fma_f32 v12, v13, v12, v12
+; SI-NEXT:    v_mul_f32_e32 v13, v10, v12
+; SI-NEXT:    v_fma_f32 v14, -v11, v13, v10
+; SI-NEXT:    v_fma_f32 v13, v14, v12, v13
+; SI-NEXT:    v_fma_f32 v10, -v11, v13, v10
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
-; SI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
-; SI-NEXT:    v_trunc_f32_e32 v8, v8
-; SI-NEXT:    v_fma_f32 v1, -v8, v1, v5
-; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
-; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
-; SI-NEXT:    v_rcp_f32_e32 v9, v8
+; SI-NEXT:    v_div_fmas_f32 v10, v10, v12, v13
+; SI-NEXT:    v_div_fixup_f32 v10, v10, v8, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB10_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 11
+; SI-NEXT:  .LBB10_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v11, v9
+; SI-NEXT:    v_mul_f32_e32 v9, v11, v10
+; SI-NEXT:    v_rndne_f32_e32 v9, v9
+; SI-NEXT:    v_fma_f32 v9, -v9, v8, v11
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v9
+; SI-NEXT:    v_add_f32_e32 v12, v9, v8
+; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v9, v9, 11
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    s_cmp_gt_i32 s1, 11
+; SI-NEXT:    s_cbranch_scc1 .LBB10_5
+; SI-NEXT:  ; %bb.6: ; %Flow142
+; SI-NEXT:    v_mov_b32_e32 v9, v11
+; SI-NEXT:  .LBB10_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s1, s1, -10
+; SI-NEXT:    v_ldexp_f32_e64 v9, v9, s1
+; SI-NEXT:    v_mul_f32_e32 v10, v9, v10
+; SI-NEXT:    v_rndne_f32_e32 v10, v10
+; SI-NEXT:    v_fma_f32 v9, -v10, v8, v9
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v9
+; SI-NEXT:    v_add_f32_e32 v8, v9, v8
+; SI-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v8, v8, s0
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_and_b32_e32 v9, 0x8000, v9
+; SI-NEXT:    v_xor_b32_e32 v8, v9, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:  .LBB10_8:
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v5
+; SI-NEXT:    v_cvt_f32_f16_e64 v10, |v9|
+; SI-NEXT:    v_cvt_f32_f16_e64 v11, |v11|
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v10, v11
+; SI-NEXT:    s_cbranch_vccz .LBB10_10
+; SI-NEXT:  ; %bb.9: ; %frem.else20
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v12, s0, 0, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v10, v11
+; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB10_11
+; SI-NEXT:    s_branch .LBB10_16
+; SI-NEXT:  .LBB10_10:
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB10_11: ; %frem.compute19
+; SI-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v10|, s3
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v9, v10
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v9
+; SI-NEXT:    s_cselect_b32 s2, s0, 0
+; SI-NEXT:    v_frexp_mant_f32_e32 v9, v10
+; SI-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v10, v9, 11
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v11|, s3
+; SI-NEXT:    v_frexp_mant_f32_e32 v9, v11
+; SI-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v11, v11
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v11
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v9, v9, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v11, vcc, 1.0, v9, 1.0
+; SI-NEXT:    v_div_scale_f32 v12, s[4:5], v9, v9, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v13, v12
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
-; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
-; SI-NEXT:    v_mul_f32_e32 v10, v5, v9
-; SI-NEXT:    v_fma_f32 v11, -v8, v10, v5
-; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
-; SI-NEXT:    v_fma_f32 v5, -v8, v10, v5
+; SI-NEXT:    v_fma_f32 v14, -v12, v13, 1.0
+; SI-NEXT:    v_fma_f32 v13, v14, v13, v13
+; SI-NEXT:    v_mul_f32_e32 v14, v11, v13
+; SI-NEXT:    v_fma_f32 v15, -v12, v14, v11
+; SI-NEXT:    v_fma_f32 v14, v15, v13, v14
+; SI-NEXT:    v_fma_f32 v11, -v12, v14, v11
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
-; SI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
-; SI-NEXT:    v_trunc_f32_e32 v5, v5
-; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_or_b32_e32 v1, v4, v1
-; SI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
-; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
-; SI-NEXT:    v_rcp_f32_e32 v7, v5
+; SI-NEXT:    v_div_fmas_f32 v11, v11, v13, v14
+; SI-NEXT:    v_div_fixup_f32 v11, v11, v9, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB10_15
+; SI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 11
+; SI-NEXT:  .LBB10_13: ; %frem.loop_body27
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v12, v10
+; SI-NEXT:    v_mul_f32_e32 v10, v12, v11
+; SI-NEXT:    v_rndne_f32_e32 v10, v10
+; SI-NEXT:    v_fma_f32 v10, -v10, v9, v12
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; SI-NEXT:    v_add_f32_e32 v13, v10, v9
+; SI-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v10, v10, 11
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    s_cmp_gt_i32 s1, 11
+; SI-NEXT:    s_cbranch_scc1 .LBB10_13
+; SI-NEXT:  ; %bb.14: ; %Flow138
+; SI-NEXT:    v_mov_b32_e32 v10, v12
+; SI-NEXT:  .LBB10_15: ; %frem.loop_exit28
+; SI-NEXT:    s_add_i32 s1, s1, -10
+; SI-NEXT:    v_ldexp_f32_e64 v10, v10, s1
+; SI-NEXT:    v_mul_f32_e32 v11, v10, v11
+; SI-NEXT:    v_rndne_f32_e32 v11, v11
+; SI-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; SI-NEXT:    v_add_f32_e32 v9, v10, v9
+; SI-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v9, v9, s0
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT:    v_and_b32_e32 v10, 0x8000, v10
+; SI-NEXT:    v_xor_b32_e32 v9, v10, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:  .LBB10_16:
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v3
+; SI-NEXT:    v_cvt_f32_f16_e64 v11, |v10|
+; SI-NEXT:    v_cvt_f32_f16_e64 v12, |v12|
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v11, v12
+; SI-NEXT:    s_cbranch_vccz .LBB10_18
+; SI-NEXT:  ; %bb.17: ; %frem.else56
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v13, s0, 0, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v11, v12
+; SI-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB10_19
+; SI-NEXT:    s_branch .LBB10_24
+; SI-NEXT:  .LBB10_18:
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB10_19: ; %frem.compute55
+; SI-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v11|, s3
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v11
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v10
+; SI-NEXT:    s_cselect_b32 s2, s0, 0
+; SI-NEXT:    v_frexp_mant_f32_e32 v10, v11
+; SI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v11, v10, 11
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v12|, s3
+; SI-NEXT:    v_frexp_mant_f32_e32 v10, v12
+; SI-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v12, v12
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v12
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v10, v10, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v12, vcc, 1.0, v10, 1.0
+; SI-NEXT:    v_div_scale_f32 v13, s[4:5], v10, v10, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v14, v13
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
-; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
-; SI-NEXT:    v_mul_f32_e32 v8, v4, v7
-; SI-NEXT:    v_fma_f32 v9, -v5, v8, v4
-; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
-; SI-NEXT:    v_fma_f32 v4, -v5, v8, v4
+; SI-NEXT:    v_fma_f32 v15, -v13, v14, 1.0
+; SI-NEXT:    v_fma_f32 v14, v15, v14, v14
+; SI-NEXT:    v_mul_f32_e32 v15, v12, v14
+; SI-NEXT:    v_fma_f32 v16, -v13, v15, v12
+; SI-NEXT:    v_fma_f32 v15, v16, v14, v15
+; SI-NEXT:    v_fma_f32 v12, -v13, v15, v12
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
-; SI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
-; SI-NEXT:    v_trunc_f32_e32 v4, v4
-; SI-NEXT:    v_fma_f32 v0, -v4, v0, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
-; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
-; SI-NEXT:    v_rcp_f32_e32 v5, v4
+; SI-NEXT:    v_div_fmas_f32 v12, v12, v14, v15
+; SI-NEXT:    v_div_fixup_f32 v12, v12, v10, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB10_23
+; SI-NEXT:  ; %bb.20: ; %frem.loop_body63.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 11
+; SI-NEXT:  .LBB10_21: ; %frem.loop_body63
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v13, v11
+; SI-NEXT:    v_mul_f32_e32 v11, v13, v12
+; SI-NEXT:    v_rndne_f32_e32 v11, v11
+; SI-NEXT:    v_fma_f32 v11, -v11, v10, v13
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; SI-NEXT:    v_add_f32_e32 v14, v11, v10
+; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v11, v11, 11
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    s_cmp_gt_i32 s1, 11
+; SI-NEXT:    s_cbranch_scc1 .LBB10_21
+; SI-NEXT:  ; %bb.22: ; %Flow134
+; SI-NEXT:    v_mov_b32_e32 v11, v13
+; SI-NEXT:  .LBB10_23: ; %frem.loop_exit64
+; SI-NEXT:    s_add_i32 s1, s1, -10
+; SI-NEXT:    v_ldexp_f32_e64 v11, v11, s1
+; SI-NEXT:    v_mul_f32_e32 v12, v11, v12
+; SI-NEXT:    v_rndne_f32_e32 v12, v12
+; SI-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; SI-NEXT:    v_add_f32_e32 v10, v11, v10
+; SI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v10, v10, s0
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_and_b32_e32 v11, 0x8000, v11
+; SI-NEXT:    v_xor_b32_e32 v10, v11, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:  .LBB10_24:
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v1
+; SI-NEXT:    v_cvt_f32_f16_e64 v12, |v11|
+; SI-NEXT:    v_cvt_f32_f16_e64 v13, |v13|
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v12, v13
+; SI-NEXT:    s_cbranch_vccz .LBB10_26
+; SI-NEXT:  ; %bb.25: ; %frem.else92
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v14, s0, 0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v12, v13
+; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB10_27
+; SI-NEXT:    s_branch .LBB10_32
+; SI-NEXT:  .LBB10_26:
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB10_27: ; %frem.compute91
+; SI-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v12|, s3
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v11, v12
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v11
+; SI-NEXT:    s_cselect_b32 s2, s0, 0
+; SI-NEXT:    v_frexp_mant_f32_e32 v11, v12
+; SI-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v12, v11, 11
+; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v13|, s3
+; SI-NEXT:    v_frexp_mant_f32_e32 v11, v13
+; SI-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v13, v13
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v13
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v11, v11, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v13, vcc, 1.0, v11, 1.0
+; SI-NEXT:    v_div_scale_f32 v14, s[4:5], v11, v11, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v15, v14
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
-; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
-; SI-NEXT:    v_mul_f32_e32 v7, v3, v5
-; SI-NEXT:    v_fma_f32 v8, -v4, v7, v3
-; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
-; SI-NEXT:    v_fma_f32 v3, -v4, v7, v3
+; SI-NEXT:    v_fma_f32 v16, -v14, v15, 1.0
+; SI-NEXT:    v_fma_f32 v15, v16, v15, v15
+; SI-NEXT:    v_mul_f32_e32 v16, v13, v15
+; SI-NEXT:    v_fma_f32 v17, -v14, v16, v13
+; SI-NEXT:    v_fma_f32 v16, v17, v15, v16
+; SI-NEXT:    v_fma_f32 v13, -v14, v16, v13
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
-; SI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
-; SI-NEXT:    v_trunc_f32_e32 v3, v3
-; SI-NEXT:    v_fma_f32 v2, -v3, v6, v2
+; SI-NEXT:    v_div_fmas_f32 v13, v13, v15, v16
+; SI-NEXT:    v_div_fixup_f32 v13, v13, v11, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB10_31
+; SI-NEXT:  ; %bb.28: ; %frem.loop_body99.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 11
+; SI-NEXT:  .LBB10_29: ; %frem.loop_body99
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v14, v12
+; SI-NEXT:    v_mul_f32_e32 v12, v14, v13
+; SI-NEXT:    v_rndne_f32_e32 v12, v12
+; SI-NEXT:    v_fma_f32 v12, -v12, v11, v14
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; SI-NEXT:    v_add_f32_e32 v15, v12, v11
+; SI-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v12, v12, 11
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    s_cmp_gt_i32 s1, 11
+; SI-NEXT:    s_cbranch_scc1 .LBB10_29
+; SI-NEXT:  ; %bb.30: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v12, v14
+; SI-NEXT:  .LBB10_31: ; %frem.loop_exit100
+; SI-NEXT:    s_add_i32 s1, s1, -10
+; SI-NEXT:    v_ldexp_f32_e64 v12, v12, s1
+; SI-NEXT:    v_mul_f32_e32 v13, v12, v13
+; SI-NEXT:    v_rndne_f32_e32 v13, v13
+; SI-NEXT:    v_fma_f32 v12, -v13, v11, v12
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; SI-NEXT:    v_add_f32_e32 v11, v12, v11
+; SI-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v11, v11, s0
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_and_b32_e32 v12, 0x8000, v12
+; SI-NEXT:    v_xor_b32_e32 v11, v12, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT:  .LBB10_32: ; %Flow133
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v7
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_and_b32_e32 v7, 0x7fff, v7
+; SI-NEXT:    s_movk_i32 s4, 0x7c01
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_and_b32_e32 v6, 0x7fff, v6
+; SI-NEXT:    s_movk_i32 s5, 0x7c00
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v6
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v6, 0x7fc00000
+; SI-NEXT:    v_cndmask_b32_e32 v7, v6, v8, vcc
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v5
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_and_b32_e32 v5, 0x7fff, v5
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v4
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v8, vcc
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v2
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v1, v2, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; SI-NEXT:    v_or_b32_e32 v0, v7, v0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: frem_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s6, s2
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s0, s8
-; CI-NEXT:    s_mov_b32 s1, s9
-; CI-NEXT:    s_mov_b32 s8, s10
-; CI-NEXT:    s_mov_b32 s9, s11
-; CI-NEXT:    s_mov_b32 s10, s2
-; CI-NEXT:    s_mov_b32 s11, s3
-; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; CI-NEXT:    s_mov_b32 s7, s3
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_dwordx2 v[7:8], off, s[0:3], 0 offset:32
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v4
+; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
-; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
-; CI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
-; CI-NEXT:    v_rcp_f32_e32 v10, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v6
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v6|, |v7|
+; CI-NEXT:    v_and_b32_e32 v10, 0x7fffffff, v6
+; CI-NEXT:    v_and_b32_e32 v9, 0x7fffffff, v7
+; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT:    s_cbranch_vccz .LBB10_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:    v_bfi_b32 v11, s0, 0, v6
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v10, v9
+; CI-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; CI-NEXT:    s_cbranch_execz .LBB10_3
+; CI-NEXT:    s_branch .LBB10_8
+; CI-NEXT:  .LBB10_2:
+; CI-NEXT:    ; implicit-def: $vgpr8
+; CI-NEXT:  .LBB10_3: ; %frem.compute
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v13, v10
+; CI-NEXT:    v_frexp_mant_f32_e32 v8, v10
+; CI-NEXT:    v_frexp_mant_f32_e32 v10, v9
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v14, v9
+; CI-NEXT:    v_ldexp_f32_e64 v9, v10, 1
+; CI-NEXT:    v_div_scale_f32 v15, s[0:1], v9, v9, 1.0
+; CI-NEXT:    v_ldexp_f32_e64 v11, v8, 11
+; CI-NEXT:    v_add_i32_e32 v8, vcc, -1, v14
+; CI-NEXT:    v_not_b32_e32 v10, v8
+; CI-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CI-NEXT:    v_div_scale_f32 v12, vcc, 1.0, v9, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v16, v15
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
-; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
-; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
-; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
-; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
-; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
+; CI-NEXT:    v_fma_f32 v17, -v15, v16, 1.0
+; CI-NEXT:    v_fma_f32 v16, v17, v16, v16
+; CI-NEXT:    v_mul_f32_e32 v17, v12, v16
+; CI-NEXT:    v_fma_f32 v18, -v15, v17, v12
+; CI-NEXT:    v_fma_f32 v17, v18, v16, v17
+; CI-NEXT:    v_fma_f32 v12, -v15, v17, v12
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
-; CI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
-; CI-NEXT:    v_trunc_f32_e32 v8, v8
-; CI-NEXT:    v_fma_f32 v1, -v8, v1, v5
-; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
-; CI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
-; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; CI-NEXT:    v_rcp_f32_e32 v9, v8
+; CI-NEXT:    v_div_fmas_f32 v12, v12, v16, v17
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v10
+; CI-NEXT:    v_div_fixup_f32 v12, v12, v9, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB10_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v10, vcc, v13, v14
+; CI-NEXT:    v_add_i32_e32 v10, vcc, 11, v10
+; CI-NEXT:  .LBB10_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v13, v11
+; CI-NEXT:    v_mul_f32_e32 v11, v13, v12
+; CI-NEXT:    v_rndne_f32_e32 v11, v11
+; CI-NEXT:    v_fma_f32 v11, -v11, v9, v13
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; CI-NEXT:    v_add_f32_e32 v14, v11, v9
+; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; CI-NEXT:    v_add_i32_e32 v10, vcc, -11, v10
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v10
+; CI-NEXT:    v_ldexp_f32_e64 v11, v11, 11
+; CI-NEXT:    s_cbranch_vccnz .LBB10_5
+; CI-NEXT:  ; %bb.6: ; %Flow142
+; CI-NEXT:    v_mov_b32_e32 v11, v13
+; CI-NEXT:  .LBB10_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v10, vcc, -10, v10
+; CI-NEXT:    v_ldexp_f32_e32 v10, v11, v10
+; CI-NEXT:    v_mul_f32_e32 v11, v10, v12
+; CI-NEXT:    v_rndne_f32_e32 v11, v11
+; CI-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; CI-NEXT:    v_add_f32_e32 v9, v10, v9
+; CI-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v8, v9, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT:    v_and_b32_e32 v9, 0x8000, v9
+; CI-NEXT:    v_xor_b32_e32 v8, v9, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:  .LBB10_8:
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v5
+; CI-NEXT:    v_cvt_f32_f16_e64 v11, |v9|
+; CI-NEXT:    v_cvt_f32_f16_e64 v10, |v10|
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v11, v10
+; CI-NEXT:    s_cbranch_vccz .LBB10_10
+; CI-NEXT:  ; %bb.9: ; %frem.else20
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v12, s0, 0, v4
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v11, v10
+; CI-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; CI-NEXT:    s_cbranch_execz .LBB10_11
+; CI-NEXT:    s_branch .LBB10_16
+; CI-NEXT:  .LBB10_10:
+; CI-NEXT:    ; implicit-def: $vgpr9
+; CI-NEXT:  .LBB10_11: ; %frem.compute19
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v14, v11
+; CI-NEXT:    v_frexp_mant_f32_e32 v9, v11
+; CI-NEXT:    v_frexp_mant_f32_e32 v11, v10
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v15, v10
+; CI-NEXT:    v_ldexp_f32_e64 v10, v11, 1
+; CI-NEXT:    v_div_scale_f32 v16, s[0:1], v10, v10, 1.0
+; CI-NEXT:    v_ldexp_f32_e64 v12, v9, 11
+; CI-NEXT:    v_add_i32_e32 v9, vcc, -1, v15
+; CI-NEXT:    v_not_b32_e32 v11, v9
+; CI-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CI-NEXT:    v_div_scale_f32 v13, vcc, 1.0, v10, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v17, v16
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
-; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
-; CI-NEXT:    v_mul_f32_e32 v10, v5, v9
-; CI-NEXT:    v_fma_f32 v11, -v8, v10, v5
-; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
-; CI-NEXT:    v_fma_f32 v5, -v8, v10, v5
+; CI-NEXT:    v_fma_f32 v18, -v16, v17, 1.0
+; CI-NEXT:    v_fma_f32 v17, v18, v17, v17
+; CI-NEXT:    v_mul_f32_e32 v18, v13, v17
+; CI-NEXT:    v_fma_f32 v19, -v16, v18, v13
+; CI-NEXT:    v_fma_f32 v18, v19, v17, v18
+; CI-NEXT:    v_fma_f32 v13, -v16, v18, v13
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
-; CI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
-; CI-NEXT:    v_trunc_f32_e32 v5, v5
-; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
-; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_or_b32_e32 v1, v4, v1
-; CI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
-; CI-NEXT:    v_rcp_f32_e32 v7, v5
+; CI-NEXT:    v_div_fmas_f32 v13, v13, v17, v18
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v11
+; CI-NEXT:    v_div_fixup_f32 v13, v13, v10, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB10_15
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT:    v_sub_i32_e32 v11, vcc, v14, v15
+; CI-NEXT:    v_add_i32_e32 v11, vcc, 11, v11
+; CI-NEXT:  .LBB10_13: ; %frem.loop_body27
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v14, v12
+; CI-NEXT:    v_mul_f32_e32 v12, v14, v13
+; CI-NEXT:    v_rndne_f32_e32 v12, v12
+; CI-NEXT:    v_fma_f32 v12, -v12, v10, v14
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; CI-NEXT:    v_add_f32_e32 v15, v12, v10
+; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
+; CI-NEXT:    v_add_i32_e32 v11, vcc, -11, v11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v11
+; CI-NEXT:    v_ldexp_f32_e64 v12, v12, 11
+; CI-NEXT:    s_cbranch_vccnz .LBB10_13
+; CI-NEXT:  ; %bb.14: ; %Flow138
+; CI-NEXT:    v_mov_b32_e32 v12, v14
+; CI-NEXT:  .LBB10_15: ; %frem.loop_exit28
+; CI-NEXT:    v_add_i32_e32 v11, vcc, -10, v11
+; CI-NEXT:    v_ldexp_f32_e32 v11, v12, v11
+; CI-NEXT:    v_mul_f32_e32 v12, v11, v13
+; CI-NEXT:    v_rndne_f32_e32 v12, v12
+; CI-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; CI-NEXT:    v_add_f32_e32 v10, v11, v10
+; CI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v9, v10, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT:    v_and_b32_e32 v10, 0x8000, v10
+; CI-NEXT:    v_xor_b32_e32 v9, v10, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:  .LBB10_16:
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v3
+; CI-NEXT:    v_cvt_f32_f16_e64 v12, |v10|
+; CI-NEXT:    v_cvt_f32_f16_e64 v11, |v11|
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v12, v11
+; CI-NEXT:    s_cbranch_vccz .LBB10_18
+; CI-NEXT:  ; %bb.17: ; %frem.else56
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v13, s0, 0, v2
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v12, v11
+; CI-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; CI-NEXT:    s_cbranch_execz .LBB10_19
+; CI-NEXT:    s_branch .LBB10_24
+; CI-NEXT:  .LBB10_18:
+; CI-NEXT:    ; implicit-def: $vgpr10
+; CI-NEXT:  .LBB10_19: ; %frem.compute55
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v15, v12
+; CI-NEXT:    v_frexp_mant_f32_e32 v10, v12
+; CI-NEXT:    v_frexp_mant_f32_e32 v12, v11
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v16, v11
+; CI-NEXT:    v_ldexp_f32_e64 v11, v12, 1
+; CI-NEXT:    v_div_scale_f32 v17, s[0:1], v11, v11, 1.0
+; CI-NEXT:    v_ldexp_f32_e64 v13, v10, 11
+; CI-NEXT:    v_add_i32_e32 v10, vcc, -1, v16
+; CI-NEXT:    v_not_b32_e32 v12, v10
+; CI-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CI-NEXT:    v_div_scale_f32 v14, vcc, 1.0, v11, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v18, v17
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
-; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
-; CI-NEXT:    v_mul_f32_e32 v8, v4, v7
-; CI-NEXT:    v_fma_f32 v9, -v5, v8, v4
-; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
-; CI-NEXT:    v_fma_f32 v4, -v5, v8, v4
+; CI-NEXT:    v_fma_f32 v19, -v17, v18, 1.0
+; CI-NEXT:    v_fma_f32 v18, v19, v18, v18
+; CI-NEXT:    v_mul_f32_e32 v19, v14, v18
+; CI-NEXT:    v_fma_f32 v20, -v17, v19, v14
+; CI-NEXT:    v_fma_f32 v19, v20, v18, v19
+; CI-NEXT:    v_fma_f32 v14, -v17, v19, v14
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
-; CI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
-; CI-NEXT:    v_trunc_f32_e32 v4, v4
-; CI-NEXT:    v_fma_f32 v0, -v4, v0, v3
-; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
-; CI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; CI-NEXT:    v_rcp_f32_e32 v5, v4
+; CI-NEXT:    v_div_fmas_f32 v14, v14, v18, v19
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v12
+; CI-NEXT:    v_div_fixup_f32 v14, v14, v11, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB10_23
+; CI-NEXT:  ; %bb.20: ; %frem.loop_body63.preheader
+; CI-NEXT:    v_sub_i32_e32 v12, vcc, v15, v16
+; CI-NEXT:    v_add_i32_e32 v12, vcc, 11, v12
+; CI-NEXT:  .LBB10_21: ; %frem.loop_body63
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v15, v13
+; CI-NEXT:    v_mul_f32_e32 v13, v15, v14
+; CI-NEXT:    v_rndne_f32_e32 v13, v13
+; CI-NEXT:    v_fma_f32 v13, -v13, v11, v15
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v13
+; CI-NEXT:    v_add_f32_e32 v16, v13, v11
+; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc
+; CI-NEXT:    v_add_i32_e32 v12, vcc, -11, v12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v12
+; CI-NEXT:    v_ldexp_f32_e64 v13, v13, 11
+; CI-NEXT:    s_cbranch_vccnz .LBB10_21
+; CI-NEXT:  ; %bb.22: ; %Flow134
+; CI-NEXT:    v_mov_b32_e32 v13, v15
+; CI-NEXT:  .LBB10_23: ; %frem.loop_exit64
+; CI-NEXT:    v_add_i32_e32 v12, vcc, -10, v12
+; CI-NEXT:    v_ldexp_f32_e32 v12, v13, v12
+; CI-NEXT:    v_mul_f32_e32 v13, v12, v14
+; CI-NEXT:    v_rndne_f32_e32 v13, v13
+; CI-NEXT:    v_fma_f32 v12, -v13, v11, v12
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; CI-NEXT:    v_add_f32_e32 v11, v12, v11
+; CI-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v10, v11, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT:    v_and_b32_e32 v11, 0x8000, v11
+; CI-NEXT:    v_xor_b32_e32 v10, v11, v10
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT:  .LBB10_24:
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v1
+; CI-NEXT:    v_cvt_f32_f16_e64 v13, |v11|
+; CI-NEXT:    v_cvt_f32_f16_e64 v12, |v12|
+; CI-NEXT:    v_cmp_ngt_f32_e32 vcc, v13, v12
+; CI-NEXT:    s_cbranch_vccz .LBB10_26
+; CI-NEXT:  ; %bb.25: ; %frem.else92
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v14, s0, 0, v0
+; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v13, v12
+; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; CI-NEXT:    s_cbranch_execz .LBB10_27
+; CI-NEXT:    s_branch .LBB10_32
+; CI-NEXT:  .LBB10_26:
+; CI-NEXT:    ; implicit-def: $vgpr11
+; CI-NEXT:  .LBB10_27: ; %frem.compute91
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v16, v13
+; CI-NEXT:    v_frexp_mant_f32_e32 v11, v13
+; CI-NEXT:    v_frexp_mant_f32_e32 v13, v12
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v17, v12
+; CI-NEXT:    v_ldexp_f32_e64 v12, v13, 1
+; CI-NEXT:    v_div_scale_f32 v18, s[0:1], v12, v12, 1.0
+; CI-NEXT:    v_ldexp_f32_e64 v14, v11, 11
+; CI-NEXT:    v_add_i32_e32 v11, vcc, -1, v17
+; CI-NEXT:    v_not_b32_e32 v13, v11
+; CI-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CI-NEXT:    v_div_scale_f32 v15, vcc, 1.0, v12, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v19, v18
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
-; CI-NEXT:    v_fma_f32 v5, v7, v5, v5
-; CI-NEXT:    v_mul_f32_e32 v7, v3, v5
-; CI-NEXT:    v_fma_f32 v8, -v4, v7, v3
-; CI-NEXT:    v_fma_f32 v7, v8, v5, v7
-; CI-NEXT:    v_fma_f32 v3, -v4, v7, v3
+; CI-NEXT:    v_fma_f32 v20, -v18, v19, 1.0
+; CI-NEXT:    v_fma_f32 v19, v20, v19, v19
+; CI-NEXT:    v_mul_f32_e32 v20, v15, v19
+; CI-NEXT:    v_fma_f32 v21, -v18, v20, v15
+; CI-NEXT:    v_fma_f32 v20, v21, v19, v20
+; CI-NEXT:    v_fma_f32 v15, -v18, v20, v15
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
-; CI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
-; CI-NEXT:    v_trunc_f32_e32 v3, v3
-; CI-NEXT:    v_fma_f32 v2, -v3, v6, v2
+; CI-NEXT:    v_div_fmas_f32 v15, v15, v19, v20
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v13
+; CI-NEXT:    v_div_fixup_f32 v15, v15, v12, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB10_31
+; CI-NEXT:  ; %bb.28: ; %frem.loop_body99.preheader
+; CI-NEXT:    v_sub_i32_e32 v13, vcc, v16, v17
+; CI-NEXT:    v_add_i32_e32 v13, vcc, 11, v13
+; CI-NEXT:  .LBB10_29: ; %frem.loop_body99
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v16, v14
+; CI-NEXT:    v_mul_f32_e32 v14, v16, v15
+; CI-NEXT:    v_rndne_f32_e32 v14, v14
+; CI-NEXT:    v_fma_f32 v14, -v14, v12, v16
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v14
+; CI-NEXT:    v_add_f32_e32 v17, v14, v12
+; CI-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
+; CI-NEXT:    v_add_i32_e32 v13, vcc, -11, v13
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v13
+; CI-NEXT:    v_ldexp_f32_e64 v14, v14, 11
+; CI-NEXT:    s_cbranch_vccnz .LBB10_29
+; CI-NEXT:  ; %bb.30: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v14, v16
+; CI-NEXT:  .LBB10_31: ; %frem.loop_exit100
+; CI-NEXT:    v_add_i32_e32 v13, vcc, -10, v13
+; CI-NEXT:    v_ldexp_f32_e32 v13, v14, v13
+; CI-NEXT:    v_mul_f32_e32 v14, v13, v15
+; CI-NEXT:    v_rndne_f32_e32 v14, v14
+; CI-NEXT:    v_fma_f32 v13, -v14, v12, v13
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v13
+; CI-NEXT:    v_add_f32_e32 v12, v13, v12
+; CI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v11, v12, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT:    v_and_b32_e32 v12, 0x8000, v12
+; CI-NEXT:    v_xor_b32_e32 v11, v12, v11
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:  .LBB10_32: ; %Flow133
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    s_movk_i32 s4, 0x7c01
+; CI-NEXT:    v_cvt_f32_f16_e32 v12, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:    v_and_b32_e32 v7, 0x7fff, v7
+; CI-NEXT:    v_and_b32_e32 v6, 0x7fff, v6
+; CI-NEXT:    s_movk_i32 s5, 0x7c00
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v7
+; CI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v6
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v12
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_mov_b32_e32 v6, 0x7fc00000
+; CI-NEXT:    v_cndmask_b32_e32 v7, v6, v8, vcc
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_and_b32_e32 v5, 0x7fff, v5
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v9
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v5
+; CI-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v4
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v4, v6, v8, vcc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v10
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_or_b32_e32 v0, v2, v0
-; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v3
+; CI-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; CI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v2
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; CI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s5, v0
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    v_or_b32_e32 v1, v2, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    v_or_b32_e32 v0, v7, v0
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v4f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_u32 s0, s4, 32
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_add_u32 s0, s0, 32
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s11
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; VI-NEXT:    v_cvt_f32_f16_e32 v9, v8
+; VI-NEXT:    v_cvt_f32_f16_e64 v6, |v0|
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT:    v_rcp_f32_e32 v10, v9
-; VI-NEXT:    v_mul_f32_e32 v11, v7, v10
-; VI-NEXT:    v_mad_f32 v12, -v9, v11, v7
-; VI-NEXT:    v_mac_f32_e32 v11, v12, v10
-; VI-NEXT:    v_mad_f32 v7, -v9, v11, v7
-; VI-NEXT:    v_mul_f32_e32 v7, v7, v10
-; VI-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
-; VI-NEXT:    v_add_f32_e32 v7, v7, v11
-; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; VI-NEXT:    v_div_fixup_f16 v7, v7, v8, v6
-; VI-NEXT:    v_trunc_f16_e32 v7, v7
-; VI-NEXT:    v_fma_f16 v6, -v7, v8, v6
-; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
-; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; VI-NEXT:    v_rcp_f32_e32 v9, v8
-; VI-NEXT:    v_mul_f32_e32 v10, v7, v9
-; VI-NEXT:    v_mad_f32 v11, -v8, v10, v7
-; VI-NEXT:    v_mac_f32_e32 v10, v11, v9
-; VI-NEXT:    v_mad_f32 v7, -v8, v10, v7
-; VI-NEXT:    v_mul_f32_e32 v7, v7, v9
-; VI-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
-; VI-NEXT:    v_add_f32_e32 v7, v7, v10
+; VI-NEXT:    v_cvt_f32_f16_e64 v5, |v2|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v6, v5
+; VI-NEXT:    s_cbranch_vccz .LBB10_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_movk_i32 s0, 0x7fff
+; VI-NEXT:    v_bfi_b32 v4, s0, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v5
+; VI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; VI-NEXT:    s_cbranch_execz .LBB10_3
+; VI-NEXT:    s_branch .LBB10_8
+; VI-NEXT:  .LBB10_2:
+; VI-NEXT:    ; implicit-def: $vgpr4
+; VI-NEXT:  .LBB10_3: ; %frem.compute
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v9, v6
+; VI-NEXT:    v_frexp_mant_f32_e32 v4, v6
+; VI-NEXT:    v_frexp_mant_f32_e32 v6, v5
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v5
+; VI-NEXT:    v_ldexp_f32 v5, v6, 1
+; VI-NEXT:    v_div_scale_f32 v11, s[0:1], v5, v5, 1.0
+; VI-NEXT:    v_ldexp_f32 v7, v4, 11
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -1, v10
+; VI-NEXT:    v_not_b32_e32 v6, v4
+; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
+; VI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v5, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v12, v11
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; VI-NEXT:    v_fma_f32 v12, v13, v12, v12
+; VI-NEXT:    v_mul_f32_e32 v13, v8, v12
+; VI-NEXT:    v_fma_f32 v14, -v11, v13, v8
+; VI-NEXT:    v_fma_f32 v13, v14, v12, v13
+; VI-NEXT:    v_fma_f32 v8, -v11, v13, v8
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v8, v8, v12, v13
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v6
+; VI-NEXT:    v_div_fixup_f32 v8, v8, v5, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB10_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, v9, v10
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 11, v6
+; VI-NEXT:  .LBB10_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v9, v7
+; VI-NEXT:    v_mul_f32_e32 v7, v9, v8
+; VI-NEXT:    v_rndne_f32_e32 v7, v7
+; VI-NEXT:    v_fma_f32 v7, -v7, v5, v9
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT:    v_add_f32_e32 v10, v7, v5
+; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; VI-NEXT:    v_add_u32_e32 v6, vcc, -11, v6
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v6
+; VI-NEXT:    v_ldexp_f32 v7, v7, 11
+; VI-NEXT:    s_cbranch_vccnz .LBB10_5
+; VI-NEXT:  ; %bb.6: ; %Flow142
+; VI-NEXT:    v_mov_b32_e32 v7, v9
+; VI-NEXT:  .LBB10_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v6, vcc, -10, v6
+; VI-NEXT:    v_ldexp_f32 v6, v7, v6
+; VI-NEXT:    v_mul_f32_e32 v7, v6, v8
+; VI-NEXT:    v_rndne_f32_e32 v7, v7
+; VI-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT:    v_add_f32_e32 v5, v6, v5
+; VI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; VI-NEXT:    v_ldexp_f32 v4, v5, v4
+; VI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; VI-NEXT:    v_and_b32_e32 v5, 0x8000, v0
+; VI-NEXT:    v_xor_b32_e32 v4, v5, v4
+; VI-NEXT:  .LBB10_8:
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; VI-NEXT:    v_cvt_f32_f16_e64 v9, |v5|
+; VI-NEXT:    v_cvt_f32_f16_e64 v8, |v6|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v9, v8
+; VI-NEXT:    s_cbranch_vccz .LBB10_10
+; VI-NEXT:  ; %bb.9: ; %frem.else20
+; VI-NEXT:    s_movk_i32 s0, 0x7fff
+; VI-NEXT:    v_bfi_b32 v7, s0, 0, v5
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v9, v8
+; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
+; VI-NEXT:    s_cbranch_execz .LBB10_11
+; VI-NEXT:    s_branch .LBB10_16
+; VI-NEXT:  .LBB10_10:
+; VI-NEXT:    ; implicit-def: $vgpr7
+; VI-NEXT:  .LBB10_11: ; %frem.compute19
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v12, v9
+; VI-NEXT:    v_frexp_mant_f32_e32 v7, v9
+; VI-NEXT:    v_frexp_mant_f32_e32 v9, v8
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v13, v8
+; VI-NEXT:    v_ldexp_f32 v8, v9, 1
+; VI-NEXT:    v_div_scale_f32 v14, s[0:1], v8, v8, 1.0
+; VI-NEXT:    v_ldexp_f32 v10, v7, 11
+; VI-NEXT:    v_add_u32_e32 v7, vcc, -1, v13
+; VI-NEXT:    v_not_b32_e32 v9, v7
+; VI-NEXT:    v_add_u32_e32 v9, vcc, v9, v12
+; VI-NEXT:    v_div_scale_f32 v11, vcc, 1.0, v8, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v15, v14
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v16, -v14, v15, 1.0
+; VI-NEXT:    v_fma_f32 v15, v16, v15, v15
+; VI-NEXT:    v_mul_f32_e32 v16, v11, v15
+; VI-NEXT:    v_fma_f32 v17, -v14, v16, v11
+; VI-NEXT:    v_fma_f32 v16, v17, v15, v16
+; VI-NEXT:    v_fma_f32 v11, -v14, v16, v11
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v11, v11, v15, v16
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v9
+; VI-NEXT:    v_div_fixup_f32 v11, v11, v8, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB10_15
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, v12, v13
+; VI-NEXT:    v_add_u32_e32 v9, vcc, 11, v9
+; VI-NEXT:  .LBB10_13: ; %frem.loop_body27
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v12, v10
+; VI-NEXT:    v_mul_f32_e32 v10, v12, v11
+; VI-NEXT:    v_rndne_f32_e32 v10, v10
+; VI-NEXT:    v_fma_f32 v10, -v10, v8, v12
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; VI-NEXT:    v_add_f32_e32 v13, v10, v8
+; VI-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; VI-NEXT:    v_add_u32_e32 v9, vcc, -11, v9
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v9
+; VI-NEXT:    v_ldexp_f32 v10, v10, 11
+; VI-NEXT:    s_cbranch_vccnz .LBB10_13
+; VI-NEXT:  ; %bb.14: ; %Flow138
+; VI-NEXT:    v_mov_b32_e32 v10, v12
+; VI-NEXT:  .LBB10_15: ; %frem.loop_exit28
+; VI-NEXT:    v_add_u32_e32 v9, vcc, -10, v9
+; VI-NEXT:    v_ldexp_f32 v9, v10, v9
+; VI-NEXT:    v_mul_f32_e32 v10, v9, v11
+; VI-NEXT:    v_rndne_f32_e32 v10, v10
+; VI-NEXT:    v_fma_f32 v9, -v10, v8, v9
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v9
+; VI-NEXT:    v_add_f32_e32 v8, v9, v8
+; VI-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; VI-NEXT:    v_ldexp_f32 v7, v8, v7
 ; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; VI-NEXT:    v_div_fixup_f16 v7, v7, v5, v3
-; VI-NEXT:    v_trunc_f16_e32 v7, v7
-; VI-NEXT:    v_fma_f16 v3, -v7, v5, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; VI-NEXT:    v_or_b32_e32 v3, v3, v6
-; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; VI-NEXT:    v_rcp_f32_e32 v9, v8
-; VI-NEXT:    v_mul_f32_e32 v10, v6, v9
-; VI-NEXT:    v_mad_f32 v11, -v8, v10, v6
-; VI-NEXT:    v_mac_f32_e32 v10, v11, v9
-; VI-NEXT:    v_mad_f32 v6, -v8, v10, v6
-; VI-NEXT:    v_mul_f32_e32 v6, v6, v9
-; VI-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; VI-NEXT:    v_add_f32_e32 v6, v6, v10
-; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; VI-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
-; VI-NEXT:    v_trunc_f16_e32 v6, v6
-; VI-NEXT:    v_fma_f16 v5, -v6, v7, v5
-; VI-NEXT:    v_cvt_f32_f16_e32 v7, v4
-; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; VI-NEXT:    v_rcp_f32_e32 v8, v7
-; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
-; VI-NEXT:    v_mad_f32 v10, -v7, v9, v6
-; VI-NEXT:    v_mac_f32_e32 v9, v10, v8
-; VI-NEXT:    v_mad_f32 v6, -v7, v9, v6
-; VI-NEXT:    v_mul_f32_e32 v6, v6, v8
-; VI-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; VI-NEXT:    v_add_f32_e32 v6, v6, v9
-; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; VI-NEXT:    v_div_fixup_f16 v6, v6, v4, v2
-; VI-NEXT:    v_trunc_f16_e32 v6, v6
-; VI-NEXT:    v_fma_f16 v2, -v6, v4, v2
-; VI-NEXT:    v_or_b32_e32 v2, v2, v5
+; VI-NEXT:    v_and_b32_e32 v8, 0x8000, v5
+; VI-NEXT:    v_xor_b32_e32 v7, v8, v7
+; VI-NEXT:  .LBB10_16:
+; VI-NEXT:    v_cvt_f32_f16_e64 v10, |v1|
+; VI-NEXT:    v_cvt_f32_f16_e64 v9, |v3|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v10, v9
+; VI-NEXT:    s_cbranch_vccz .LBB10_18
+; VI-NEXT:  ; %bb.17: ; %frem.else56
+; VI-NEXT:    s_movk_i32 s0, 0x7fff
+; VI-NEXT:    v_bfi_b32 v8, s0, 0, v1
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v10, v9
+; VI-NEXT:    v_cndmask_b32_e32 v8, v1, v8, vcc
+; VI-NEXT:    s_cbranch_execz .LBB10_19
+; VI-NEXT:    s_branch .LBB10_24
+; VI-NEXT:  .LBB10_18:
+; VI-NEXT:    ; implicit-def: $vgpr8
+; VI-NEXT:  .LBB10_19: ; %frem.compute55
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v13, v10
+; VI-NEXT:    v_frexp_mant_f32_e32 v8, v10
+; VI-NEXT:    v_frexp_mant_f32_e32 v10, v9
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v14, v9
+; VI-NEXT:    v_ldexp_f32 v9, v10, 1
+; VI-NEXT:    v_div_scale_f32 v15, s[0:1], v9, v9, 1.0
+; VI-NEXT:    v_ldexp_f32 v11, v8, 11
+; VI-NEXT:    v_add_u32_e32 v8, vcc, -1, v14
+; VI-NEXT:    v_not_b32_e32 v10, v8
+; VI-NEXT:    v_add_u32_e32 v10, vcc, v10, v13
+; VI-NEXT:    v_div_scale_f32 v12, vcc, 1.0, v9, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v16, v15
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v17, -v15, v16, 1.0
+; VI-NEXT:    v_fma_f32 v16, v17, v16, v16
+; VI-NEXT:    v_mul_f32_e32 v17, v12, v16
+; VI-NEXT:    v_fma_f32 v18, -v15, v17, v12
+; VI-NEXT:    v_fma_f32 v17, v18, v16, v17
+; VI-NEXT:    v_fma_f32 v12, -v15, v17, v12
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v12, v12, v16, v17
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v10
+; VI-NEXT:    v_div_fixup_f32 v12, v12, v9, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB10_23
+; VI-NEXT:  ; %bb.20: ; %frem.loop_body63.preheader
+; VI-NEXT:    v_sub_u32_e32 v10, vcc, v13, v14
+; VI-NEXT:    v_add_u32_e32 v10, vcc, 11, v10
+; VI-NEXT:  .LBB10_21: ; %frem.loop_body63
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v13, v11
+; VI-NEXT:    v_mul_f32_e32 v11, v13, v12
+; VI-NEXT:    v_rndne_f32_e32 v11, v11
+; VI-NEXT:    v_fma_f32 v11, -v11, v9, v13
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; VI-NEXT:    v_add_f32_e32 v14, v11, v9
+; VI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; VI-NEXT:    v_add_u32_e32 v10, vcc, -11, v10
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v10
+; VI-NEXT:    v_ldexp_f32 v11, v11, 11
+; VI-NEXT:    s_cbranch_vccnz .LBB10_21
+; VI-NEXT:  ; %bb.22: ; %Flow134
+; VI-NEXT:    v_mov_b32_e32 v11, v13
+; VI-NEXT:  .LBB10_23: ; %frem.loop_exit64
+; VI-NEXT:    v_add_u32_e32 v10, vcc, -10, v10
+; VI-NEXT:    v_ldexp_f32 v10, v11, v10
+; VI-NEXT:    v_mul_f32_e32 v11, v10, v12
+; VI-NEXT:    v_rndne_f32_e32 v11, v11
+; VI-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; VI-NEXT:    v_add_f32_e32 v9, v10, v9
+; VI-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; VI-NEXT:    v_ldexp_f32 v8, v9, v8
+; VI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; VI-NEXT:    v_and_b32_e32 v9, 0x8000, v1
+; VI-NEXT:    v_xor_b32_e32 v8, v9, v8
+; VI-NEXT:  .LBB10_24:
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
+; VI-NEXT:    v_cvt_f32_f16_e64 v13, |v9|
+; VI-NEXT:    v_cvt_f32_f16_e64 v12, |v10|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v13, v12
+; VI-NEXT:    s_cbranch_vccz .LBB10_26
+; VI-NEXT:  ; %bb.25: ; %frem.else92
+; VI-NEXT:    s_movk_i32 s0, 0x7fff
+; VI-NEXT:    v_bfi_b32 v11, s0, 0, v9
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v13, v12
+; VI-NEXT:    v_cndmask_b32_e32 v11, v9, v11, vcc
+; VI-NEXT:    s_cbranch_execz .LBB10_27
+; VI-NEXT:    s_branch .LBB10_32
+; VI-NEXT:  .LBB10_26:
+; VI-NEXT:    ; implicit-def: $vgpr11
+; VI-NEXT:  .LBB10_27: ; %frem.compute91
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v16, v13
+; VI-NEXT:    v_frexp_mant_f32_e32 v11, v13
+; VI-NEXT:    v_frexp_mant_f32_e32 v13, v12
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v17, v12
+; VI-NEXT:    v_ldexp_f32 v12, v13, 1
+; VI-NEXT:    v_div_scale_f32 v18, s[0:1], v12, v12, 1.0
+; VI-NEXT:    v_ldexp_f32 v14, v11, 11
+; VI-NEXT:    v_add_u32_e32 v11, vcc, -1, v17
+; VI-NEXT:    v_not_b32_e32 v13, v11
+; VI-NEXT:    v_add_u32_e32 v13, vcc, v13, v16
+; VI-NEXT:    v_div_scale_f32 v15, vcc, 1.0, v12, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v19, v18
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT:    v_fma_f32 v20, -v18, v19, 1.0
+; VI-NEXT:    v_fma_f32 v19, v20, v19, v19
+; VI-NEXT:    v_mul_f32_e32 v20, v15, v19
+; VI-NEXT:    v_fma_f32 v21, -v18, v20, v15
+; VI-NEXT:    v_fma_f32 v20, v21, v19, v20
+; VI-NEXT:    v_fma_f32 v15, -v18, v20, v15
+; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT:    v_div_fmas_f32 v15, v15, v19, v20
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v13
+; VI-NEXT:    v_div_fixup_f32 v15, v15, v12, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB10_31
+; VI-NEXT:  ; %bb.28: ; %frem.loop_body99.preheader
+; VI-NEXT:    v_sub_u32_e32 v13, vcc, v16, v17
+; VI-NEXT:    v_add_u32_e32 v13, vcc, 11, v13
+; VI-NEXT:  .LBB10_29: ; %frem.loop_body99
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v16, v14
+; VI-NEXT:    v_mul_f32_e32 v14, v16, v15
+; VI-NEXT:    v_rndne_f32_e32 v14, v14
+; VI-NEXT:    v_fma_f32 v14, -v14, v12, v16
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v14
+; VI-NEXT:    v_add_f32_e32 v17, v14, v12
+; VI-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
+; VI-NEXT:    v_add_u32_e32 v13, vcc, -11, v13
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v13
+; VI-NEXT:    v_ldexp_f32 v14, v14, 11
+; VI-NEXT:    s_cbranch_vccnz .LBB10_29
+; VI-NEXT:  ; %bb.30: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v14, v16
+; VI-NEXT:  .LBB10_31: ; %frem.loop_exit100
+; VI-NEXT:    v_add_u32_e32 v13, vcc, -10, v13
+; VI-NEXT:    v_ldexp_f32 v13, v14, v13
+; VI-NEXT:    v_mul_f32_e32 v14, v13, v15
+; VI-NEXT:    v_rndne_f32_e32 v14, v14
+; VI-NEXT:    v_fma_f32 v13, -v14, v12, v13
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v13
+; VI-NEXT:    v_add_f32_e32 v12, v13, v12
+; VI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
+; VI-NEXT:    v_ldexp_f32 v11, v12, v11
+; VI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; VI-NEXT:    v_and_b32_e32 v12, 0x8000, v9
+; VI-NEXT:    v_xor_b32_e32 v11, v12, v11
+; VI-NEXT:  .LBB10_32: ; %Flow133
+; VI-NEXT:    v_mov_b32_e32 v12, 0x3fc
+; VI-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v2
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], v2, v12
+; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], v0, v2
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v13, 0x7e00
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], v6, v12
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], v5, v2
+; VI-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
+; VI-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v6
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], v3, v12
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], v1, v2
+; VI-NEXT:    v_cndmask_b32_e32 v5, v13, v7, vcc
+; VI-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v3
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cmp_class_f16_e64 s[0:1], v10, v12
+; VI-NEXT:    v_cmp_class_f16_e64 s[2:3], v9, v2
+; VI-NEXT:    v_cndmask_b32_e32 v3, v13, v8, vcc
+; VI-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v10
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v13, v11, vcc
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; VI-NEXT:    v_or_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: frem_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7] offset:32
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[10:11]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v6, |v2|
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v9, v8
-; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX9-NEXT:    v_rcp_f32_e32 v9, v9
-; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v6
-; GFX9-NEXT:    v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mac_f32_e32 v5, v7, v6
-; GFX9-NEXT:    v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v5, |v0|
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v6, v5
+; GFX9-NEXT:    s_cbranch_vccz .LBB10_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v4, s0, 0, v2
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB10_3
+; GFX9-NEXT:    s_branch .LBB10_8
+; GFX9-NEXT:  .LBB10_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr4
+; GFX9-NEXT:  .LBB10_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v9, v6
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v4, v6
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v6, v5
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v10, v5
+; GFX9-NEXT:    v_ldexp_f32 v5, v6, 1
+; GFX9-NEXT:    v_div_scale_f32 v11, s[0:1], v5, v5, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v5, 1.0
+; GFX9-NEXT:    v_ldexp_f32 v7, v4, 11
+; GFX9-NEXT:    v_add_u32_e32 v4, -1, v10
+; GFX9-NEXT:    v_not_b32_e32 v6, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v9
+; GFX9-NEXT:    v_rcp_f32_e32 v12, v11
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; GFX9-NEXT:    v_fma_f32 v12, v13, v12, v12
+; GFX9-NEXT:    v_mul_f32_e32 v13, v8, v12
+; GFX9-NEXT:    v_fma_f32 v14, -v11, v13, v8
+; GFX9-NEXT:    v_fma_f32 v13, v14, v12, v13
+; GFX9-NEXT:    v_fma_f32 v8, -v11, v13, v8
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-NEXT:    v_div_fmas_f32 v8, v8, v12, v13
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v6
+; GFX9-NEXT:    v_div_fixup_f32 v8, v8, v5, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB10_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v6, v9, v10
+; GFX9-NEXT:    v_add_u32_e32 v6, 11, v6
+; GFX9-NEXT:  .LBB10_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v8
+; GFX9-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX9-NEXT:    v_fma_f32 v7, -v7, v5, v9
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_add_f32_e32 v10, v7, v5
+; GFX9-NEXT:    v_add_u32_e32 v6, -11, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v6
+; GFX9-NEXT:    v_ldexp_f32 v7, v7, 11
+; GFX9-NEXT:    s_cbranch_vccnz .LBB10_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow142
+; GFX9-NEXT:    v_mov_b32_e32 v7, v9
+; GFX9-NEXT:  .LBB10_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v6, -10, v6
+; GFX9-NEXT:    v_ldexp_f32 v6, v7, v6
+; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v8
+; GFX9-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX9-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_add_f32_e32 v5, v6, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v9
-; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
-; GFX9-NEXT:    v_mad_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX9-NEXT:    v_mac_f32_e32 v7, v10, v9
-; GFX9-NEXT:    v_fma_f16 v5, -v5, v3, v1
-; GFX9-NEXT:    v_mad_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v7
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-NEXT:    v_div_fixup_f16 v1, v1, v8, v6
-; GFX9-NEXT:    v_trunc_f16_e32 v1, v1
-; GFX9-NEXT:    v_fma_f16 v1, -v1, v8, v6
-; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v8, v7
-; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX9-NEXT:    v_rcp_f32_e32 v8, v8
-; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v5
-; GFX9-NEXT:    v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mac_f32_e32 v3, v6, v5
-; GFX9-NEXT:    v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mul_f32_e32 v5, v6, v5
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
-; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v8
-; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
-; GFX9-NEXT:    v_mad_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX9-NEXT:    v_mac_f32_e32 v6, v9, v8
-; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v0
-; GFX9-NEXT:    v_mad_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v6
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_div_fixup_f16 v0, v0, v7, v5
-; GFX9-NEXT:    v_trunc_f16_e32 v0, v0
-; GFX9-NEXT:    v_fma_f16 v0, -v0, v7, v5
-; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0x8000, v2
+; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX9-NEXT:  .LBB10_8:
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v8, |v5|
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v7, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v8, v7
+; GFX9-NEXT:    s_cbranch_vccz .LBB10_10
+; GFX9-NEXT:  ; %bb.9: ; %frem.else20
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v6, s0, 0, v5
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v8, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB10_11
+; GFX9-NEXT:    s_branch .LBB10_16
+; GFX9-NEXT:  .LBB10_10:
+; GFX9-NEXT:    ; implicit-def: $vgpr6
+; GFX9-NEXT:  .LBB10_11: ; %frem.compute19
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v11, v8
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v6, v8
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v8, v7
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v12, v7
+; GFX9-NEXT:    v_ldexp_f32 v7, v8, 1
+; GFX9-NEXT:    v_div_scale_f32 v13, s[0:1], v7, v7, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v10, vcc, 1.0, v7, 1.0
+; GFX9-NEXT:    v_ldexp_f32 v9, v6, 11
+; GFX9-NEXT:    v_add_u32_e32 v6, -1, v12
+; GFX9-NEXT:    v_not_b32_e32 v8, v6
+; GFX9-NEXT:    v_add_u32_e32 v8, v8, v11
+; GFX9-NEXT:    v_rcp_f32_e32 v14, v13
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-NEXT:    v_fma_f32 v15, -v13, v14, 1.0
+; GFX9-NEXT:    v_fma_f32 v14, v15, v14, v14
+; GFX9-NEXT:    v_mul_f32_e32 v15, v10, v14
+; GFX9-NEXT:    v_fma_f32 v16, -v13, v15, v10
+; GFX9-NEXT:    v_fma_f32 v15, v16, v14, v15
+; GFX9-NEXT:    v_fma_f32 v10, -v13, v15, v10
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-NEXT:    v_div_fmas_f32 v10, v10, v14, v15
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v8
+; GFX9-NEXT:    v_div_fixup_f32 v10, v10, v7, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB10_15
+; GFX9-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v8, v11, v12
+; GFX9-NEXT:    v_add_u32_e32 v8, 11, v8
+; GFX9-NEXT:  .LBB10_13: ; %frem.loop_body27
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-NEXT:    v_mul_f32_e32 v9, v11, v10
+; GFX9-NEXT:    v_rndne_f32_e32 v9, v9
+; GFX9-NEXT:    v_fma_f32 v9, -v9, v7, v11
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_add_f32_e32 v12, v9, v7
+; GFX9-NEXT:    v_add_u32_e32 v8, -11, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v8
+; GFX9-NEXT:    v_ldexp_f32 v9, v9, 11
+; GFX9-NEXT:    s_cbranch_vccnz .LBB10_13
+; GFX9-NEXT:  ; %bb.14: ; %Flow138
+; GFX9-NEXT:    v_mov_b32_e32 v9, v11
+; GFX9-NEXT:  .LBB10_15: ; %frem.loop_exit28
+; GFX9-NEXT:    v_add_u32_e32 v8, -10, v8
+; GFX9-NEXT:    v_ldexp_f32 v8, v9, v8
+; GFX9-NEXT:    v_mul_f32_e32 v9, v8, v10
+; GFX9-NEXT:    v_rndne_f32_e32 v9, v9
+; GFX9-NEXT:    v_fma_f32 v8, -v9, v7, v8
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX9-NEXT:    v_ldexp_f32 v6, v7, v6
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX9-NEXT:    v_and_b32_e32 v7, 0x8000, v5
+; GFX9-NEXT:    v_xor_b32_e32 v6, v7, v6
+; GFX9-NEXT:  .LBB10_16:
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v9, |v3|
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v8, |v1|
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v9, v8
+; GFX9-NEXT:    s_cbranch_vccz .LBB10_18
+; GFX9-NEXT:  ; %bb.17: ; %frem.else56
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v7, s0, 0, v3
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v9, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v7, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB10_19
+; GFX9-NEXT:    s_branch .LBB10_24
+; GFX9-NEXT:  .LBB10_18:
+; GFX9-NEXT:    ; implicit-def: $vgpr7
+; GFX9-NEXT:  .LBB10_19: ; %frem.compute55
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v12, v9
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v7, v9
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v9, v8
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v13, v8
+; GFX9-NEXT:    v_ldexp_f32 v8, v9, 1
+; GFX9-NEXT:    v_div_scale_f32 v14, s[0:1], v8, v8, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v11, vcc, 1.0, v8, 1.0
+; GFX9-NEXT:    v_ldexp_f32 v10, v7, 11
+; GFX9-NEXT:    v_add_u32_e32 v7, -1, v13
+; GFX9-NEXT:    v_not_b32_e32 v9, v7
+; GFX9-NEXT:    v_add_u32_e32 v9, v9, v12
+; GFX9-NEXT:    v_rcp_f32_e32 v15, v14
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-NEXT:    v_fma_f32 v16, -v14, v15, 1.0
+; GFX9-NEXT:    v_fma_f32 v15, v16, v15, v15
+; GFX9-NEXT:    v_mul_f32_e32 v16, v11, v15
+; GFX9-NEXT:    v_fma_f32 v17, -v14, v16, v11
+; GFX9-NEXT:    v_fma_f32 v16, v17, v15, v16
+; GFX9-NEXT:    v_fma_f32 v11, -v14, v16, v11
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-NEXT:    v_div_fmas_f32 v11, v11, v15, v16
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v9
+; GFX9-NEXT:    v_div_fixup_f32 v11, v11, v8, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB10_23
+; GFX9-NEXT:  ; %bb.20: ; %frem.loop_body63.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v9, v12, v13
+; GFX9-NEXT:    v_add_u32_e32 v9, 11, v9
+; GFX9-NEXT:  .LBB10_21: ; %frem.loop_body63
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v10
+; GFX9-NEXT:    v_mul_f32_e32 v10, v12, v11
+; GFX9-NEXT:    v_rndne_f32_e32 v10, v10
+; GFX9-NEXT:    v_fma_f32 v10, -v10, v8, v12
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; GFX9-NEXT:    v_add_f32_e32 v13, v10, v8
+; GFX9-NEXT:    v_add_u32_e32 v9, -11, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v9
+; GFX9-NEXT:    v_ldexp_f32 v10, v10, 11
+; GFX9-NEXT:    s_cbranch_vccnz .LBB10_21
+; GFX9-NEXT:  ; %bb.22: ; %Flow134
+; GFX9-NEXT:    v_mov_b32_e32 v10, v12
+; GFX9-NEXT:  .LBB10_23: ; %frem.loop_exit64
+; GFX9-NEXT:    v_add_u32_e32 v9, -10, v9
+; GFX9-NEXT:    v_ldexp_f32 v9, v10, v9
+; GFX9-NEXT:    v_mul_f32_e32 v10, v9, v11
+; GFX9-NEXT:    v_rndne_f32_e32 v10, v10
+; GFX9-NEXT:    v_fma_f32 v9, -v10, v8, v9
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX9-NEXT:    v_ldexp_f32 v7, v8, v7
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX9-NEXT:    v_and_b32_e32 v8, 0x8000, v3
+; GFX9-NEXT:    v_xor_b32_e32 v7, v8, v7
+; GFX9-NEXT:  .LBB10_24:
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v11, |v8|
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v10, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v11, v10
+; GFX9-NEXT:    s_cbranch_vccz .LBB10_26
+; GFX9-NEXT:  ; %bb.25: ; %frem.else92
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v9, s0, 0, v8
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v11, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB10_27
+; GFX9-NEXT:    s_branch .LBB10_32
+; GFX9-NEXT:  .LBB10_26:
+; GFX9-NEXT:    ; implicit-def: $vgpr9
+; GFX9-NEXT:  .LBB10_27: ; %frem.compute91
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v14, v11
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v9, v11
+; GFX9-NEXT:    v_frexp_mant_f32_e32 v11, v10
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v15, v10
+; GFX9-NEXT:    v_ldexp_f32 v10, v11, 1
+; GFX9-NEXT:    v_div_scale_f32 v16, s[0:1], v10, v10, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v13, vcc, 1.0, v10, 1.0
+; GFX9-NEXT:    v_ldexp_f32 v12, v9, 11
+; GFX9-NEXT:    v_add_u32_e32 v9, -1, v15
+; GFX9-NEXT:    v_not_b32_e32 v11, v9
+; GFX9-NEXT:    v_add_u32_e32 v11, v11, v14
+; GFX9-NEXT:    v_rcp_f32_e32 v17, v16
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX9-NEXT:    v_fma_f32 v18, -v16, v17, 1.0
+; GFX9-NEXT:    v_fma_f32 v17, v18, v17, v17
+; GFX9-NEXT:    v_mul_f32_e32 v18, v13, v17
+; GFX9-NEXT:    v_fma_f32 v19, -v16, v18, v13
+; GFX9-NEXT:    v_fma_f32 v18, v19, v17, v18
+; GFX9-NEXT:    v_fma_f32 v13, -v16, v18, v13
+; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX9-NEXT:    v_div_fmas_f32 v13, v13, v17, v18
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 12, v11
+; GFX9-NEXT:    v_div_fixup_f32 v13, v13, v10, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB10_31
+; GFX9-NEXT:  ; %bb.28: ; %frem.loop_body99.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v11, v14, v15
+; GFX9-NEXT:    v_add_u32_e32 v11, 11, v11
+; GFX9-NEXT:  .LBB10_29: ; %frem.loop_body99
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-NEXT:    v_mul_f32_e32 v12, v14, v13
+; GFX9-NEXT:    v_rndne_f32_e32 v12, v12
+; GFX9-NEXT:    v_fma_f32 v12, -v12, v10, v14
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; GFX9-NEXT:    v_add_f32_e32 v15, v12, v10
+; GFX9-NEXT:    v_add_u32_e32 v11, -11, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 11, v11
+; GFX9-NEXT:    v_ldexp_f32 v12, v12, 11
+; GFX9-NEXT:    s_cbranch_vccnz .LBB10_29
+; GFX9-NEXT:  ; %bb.30: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v12, v14
+; GFX9-NEXT:  .LBB10_31: ; %frem.loop_exit100
+; GFX9-NEXT:    v_add_u32_e32 v11, -10, v11
+; GFX9-NEXT:    v_ldexp_f32 v11, v12, v11
+; GFX9-NEXT:    v_mul_f32_e32 v12, v11, v13
+; GFX9-NEXT:    v_rndne_f32_e32 v12, v12
+; GFX9-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; GFX9-NEXT:    v_add_f32_e32 v10, v11, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX9-NEXT:    v_ldexp_f32 v9, v10, v9
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x8000, v8
+; GFX9-NEXT:    v_xor_b32_e32 v9, v10, v9
+; GFX9-NEXT:  .LBB10_32: ; %Flow133
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x3fc
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[0:1], v0, v10
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[2:3], v2, v11
+; GFX9-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f16_sdwa s[2:3], v0, v10 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-NEXT:    v_cmp_class_f16_e32 vcc, v5, v11
+; GFX9-NEXT:    v_cmp_neq_f16_sdwa s[0:1], v0, v12 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX9-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[0:1], v1, v10
+; GFX9-NEXT:    v_cmp_class_f16_e64 s[2:3], v3, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
+; GFX9-NEXT:    v_cmp_neq_f16_e32 vcc, 0, v1
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v7, vcc
+; GFX9-NEXT:    v_cmp_class_f16_sdwa s[2:3], v1, v10 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cmp_class_f16_e32 vcc, v8, v11
+; GFX9-NEXT:    v_cmp_neq_f16_sdwa s[0:1], v1, v12 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX9-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v9, vcc
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: frem_v4f16:
@@ -2614,72 +9320,330 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7] offset:32
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7] offset:32
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v6, |v2|
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
-; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
-; GFX10-NEXT:    v_mad_f32 v9, -v6, v8, v5
-; GFX10-NEXT:    v_mac_f32_e32 v8, v9, v7
-; GFX10-NEXT:    v_mad_f32 v5, -v6, v8, v5
-; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v7
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v8
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
-; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX10-NEXT:    v_fma_f16 v5, -v5, v3, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v3
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v8, v7
-; GFX10-NEXT:    v_mul_f32_e32 v9, v6, v8
-; GFX10-NEXT:    v_mad_f32 v10, -v7, v9, v6
-; GFX10-NEXT:    v_mac_f32_e32 v9, v10, v8
-; GFX10-NEXT:    v_mad_f32 v6, -v7, v9, v6
-; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v8
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; GFX10-NEXT:    v_add_f32_e32 v6, v6, v9
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v5, |v0|
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
+; GFX10-NEXT:    s_cbranch_vccz .LBB10_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v4, 0x7fff, 0, v2
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB10_3
+; GFX10-NEXT:    s_branch .LBB10_8
+; GFX10-NEXT:  .LBB10_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr4
+; GFX10-NEXT:  .LBB10_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v4, v6
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v8, v5
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v7, v6
+; GFX10-NEXT:    v_ldexp_f32 v6, v4, 11
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v4, v5
+; GFX10-NEXT:    v_ldexp_f32 v5, v8, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v7
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX10-NEXT:    v_div_scale_f32 v9, s4, v5, v5, 1.0
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX10-NEXT:    v_not_b32_e32 v8, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, v8, v7
+; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0
+; GFX10-NEXT:    s_denorm_mode 15
+; GFX10-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v7
+; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX10-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX10-NEXT:    s_denorm_mode 12
+; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v8
+; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB10_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 11
+; GFX10-NEXT:  .LBB10_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v6
+; GFX10-NEXT:    s_add_i32 s2, s2, -11
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX10-NEXT:    v_mul_f32_e32 v6, v9, v7
+; GFX10-NEXT:    v_rndne_f32_e32 v6, v6
+; GFX10-NEXT:    v_fma_f32 v6, -v6, v5, v9
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_add_f32_e32 v8, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v6, v6, 11
+; GFX10-NEXT:    s_cbranch_scc1 .LBB10_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow142
+; GFX10-NEXT:    v_mov_b32_e32 v8, s2
+; GFX10-NEXT:    v_mov_b32_e32 v6, v9
+; GFX10-NEXT:  .LBB10_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, -10, v8
+; GFX10-NEXT:    v_ldexp_f32 v6, v6, v8
+; GFX10-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX10-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX10-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 0x8000, v2
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX10-NEXT:  .LBB10_8:
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v7, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v8, |v5|
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v8, v7
+; GFX10-NEXT:    s_cbranch_vccz .LBB10_10
+; GFX10-NEXT:  ; %bb.9: ; %frem.else20
+; GFX10-NEXT:    v_bfi_b32 v6, 0x7fff, 0, v5
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v8, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB10_11
+; GFX10-NEXT:    s_branch .LBB10_16
+; GFX10-NEXT:  .LBB10_10:
+; GFX10-NEXT:    ; implicit-def: $vgpr6
+; GFX10-NEXT:  .LBB10_11: ; %frem.compute19
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v6, v8
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v10, v7
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v9, v8
+; GFX10-NEXT:    v_ldexp_f32 v8, v6, 11
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v6, v7
+; GFX10-NEXT:    v_ldexp_f32 v7, v10, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v9
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v6
+; GFX10-NEXT:    v_div_scale_f32 v11, s4, v7, v7, 1.0
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, -1, v6
+; GFX10-NEXT:    v_rcp_f32_e32 v12, v11
+; GFX10-NEXT:    v_not_b32_e32 v10, v6
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, v10, v9
+; GFX10-NEXT:    v_div_scale_f32 v9, vcc_lo, 1.0, v7, 1.0
+; GFX10-NEXT:    s_denorm_mode 15
+; GFX10-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v12, v13, v12
+; GFX10-NEXT:    v_mul_f32_e32 v13, v9, v12
+; GFX10-NEXT:    v_fma_f32 v14, -v11, v13, v9
+; GFX10-NEXT:    v_fmac_f32_e32 v13, v14, v12
+; GFX10-NEXT:    v_fma_f32 v9, -v11, v13, v9
+; GFX10-NEXT:    s_denorm_mode 12
+; GFX10-NEXT:    v_div_fmas_f32 v9, v9, v12, v13
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v10
+; GFX10-NEXT:    v_div_fixup_f32 v9, v9, v7, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB10_15
+; GFX10-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 11
+; GFX10-NEXT:  .LBB10_13: ; %frem.loop_body27
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v8
+; GFX10-NEXT:    s_add_i32 s2, s2, -11
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX10-NEXT:    v_mul_f32_e32 v8, v11, v9
+; GFX10-NEXT:    v_rndne_f32_e32 v8, v8
+; GFX10-NEXT:    v_fma_f32 v8, -v8, v7, v11
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_add_f32_e32 v10, v8, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v8, v8, 11
+; GFX10-NEXT:    s_cbranch_scc1 .LBB10_13
+; GFX10-NEXT:  ; %bb.14: ; %Flow138
+; GFX10-NEXT:    v_mov_b32_e32 v10, s2
+; GFX10-NEXT:    v_mov_b32_e32 v8, v11
+; GFX10-NEXT:  .LBB10_15: ; %frem.loop_exit28
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, -10, v10
+; GFX10-NEXT:    v_ldexp_f32 v8, v8, v10
+; GFX10-NEXT:    v_mul_f32_e32 v9, v8, v9
+; GFX10-NEXT:    v_rndne_f32_e32 v9, v9
+; GFX10-NEXT:    v_fma_f32 v8, -v9, v7, v8
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v6, v7, v6
+; GFX10-NEXT:    v_and_b32_e32 v7, 0x8000, v5
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX10-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
-; GFX10-NEXT:    v_trunc_f16_e32 v6, v6
-; GFX10-NEXT:    v_fma_f16 v1, -v6, v3, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v5, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
-; GFX10-NEXT:    v_mul_f32_e32 v7, v3, v6
-; GFX10-NEXT:    v_mad_f32 v8, -v5, v7, v3
-; GFX10-NEXT:    v_mac_f32_e32 v7, v8, v6
-; GFX10-NEXT:    v_mad_f32 v3, -v5, v7, v3
-; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v6
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
-; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v2
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
-; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
-; GFX10-NEXT:    v_mad_f32 v9, -v6, v8, v5
-; GFX10-NEXT:    v_mac_f32_e32 v8, v9, v7
-; GFX10-NEXT:    v_mad_f32 v5, -v6, v8, v5
-; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v7
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v8
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
-; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX10-NEXT:    v_fma_f16 v0, -v5, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX10-NEXT:    v_xor_b32_e32 v6, v7, v6
+; GFX10-NEXT:  .LBB10_16:
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v9, |v3|
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v8, |v1|
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v9, v8
+; GFX10-NEXT:    s_cbranch_vccz .LBB10_18
+; GFX10-NEXT:  ; %bb.17: ; %frem.else56
+; GFX10-NEXT:    v_bfi_b32 v7, 0x7fff, 0, v3
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v9, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v7, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB10_19
+; GFX10-NEXT:    s_branch .LBB10_24
+; GFX10-NEXT:  .LBB10_18:
+; GFX10-NEXT:    ; implicit-def: $vgpr7
+; GFX10-NEXT:  .LBB10_19: ; %frem.compute55
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v7, v9
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v11, v8
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v10, v9
+; GFX10-NEXT:    v_ldexp_f32 v9, v7, 11
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v7, v8
+; GFX10-NEXT:    v_ldexp_f32 v8, v11, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v10
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v7
+; GFX10-NEXT:    v_div_scale_f32 v12, s4, v8, v8, 1.0
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, -1, v7
+; GFX10-NEXT:    v_rcp_f32_e32 v13, v12
+; GFX10-NEXT:    v_not_b32_e32 v11, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v11, v10
+; GFX10-NEXT:    v_div_scale_f32 v10, vcc_lo, 1.0, v8, 1.0
+; GFX10-NEXT:    s_denorm_mode 15
+; GFX10-NEXT:    v_fma_f32 v14, -v12, v13, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v13, v14, v13
+; GFX10-NEXT:    v_mul_f32_e32 v14, v10, v13
+; GFX10-NEXT:    v_fma_f32 v15, -v12, v14, v10
+; GFX10-NEXT:    v_fmac_f32_e32 v14, v15, v13
+; GFX10-NEXT:    v_fma_f32 v10, -v12, v14, v10
+; GFX10-NEXT:    s_denorm_mode 12
+; GFX10-NEXT:    v_div_fmas_f32 v10, v10, v13, v14
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v11
+; GFX10-NEXT:    v_div_fixup_f32 v10, v10, v8, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB10_23
+; GFX10-NEXT:  ; %bb.20: ; %frem.loop_body63.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 11
+; GFX10-NEXT:  .LBB10_21: ; %frem.loop_body63
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v12, v9
+; GFX10-NEXT:    s_add_i32 s2, s2, -11
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX10-NEXT:    v_mul_f32_e32 v9, v12, v10
+; GFX10-NEXT:    v_rndne_f32_e32 v9, v9
+; GFX10-NEXT:    v_fma_f32 v9, -v9, v8, v12
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_add_f32_e32 v11, v9, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v9, v9, 11
+; GFX10-NEXT:    s_cbranch_scc1 .LBB10_21
+; GFX10-NEXT:  ; %bb.22: ; %Flow134
+; GFX10-NEXT:    v_mov_b32_e32 v11, s2
+; GFX10-NEXT:    v_mov_b32_e32 v9, v12
+; GFX10-NEXT:  .LBB10_23: ; %frem.loop_exit64
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, -10, v11
+; GFX10-NEXT:    v_ldexp_f32 v9, v9, v11
+; GFX10-NEXT:    v_mul_f32_e32 v10, v9, v10
+; GFX10-NEXT:    v_rndne_f32_e32 v10, v10
+; GFX10-NEXT:    v_fma_f32 v9, -v10, v8, v9
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v7, v8, v7
+; GFX10-NEXT:    v_and_b32_e32 v8, 0x8000, v3
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX10-NEXT:    v_xor_b32_e32 v7, v8, v7
+; GFX10-NEXT:  .LBB10_24:
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v10, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v11, |v8|
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v11, v10
+; GFX10-NEXT:    s_cbranch_vccz .LBB10_26
+; GFX10-NEXT:  ; %bb.25: ; %frem.else92
+; GFX10-NEXT:    v_bfi_b32 v9, 0x7fff, 0, v8
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v11, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB10_27
+; GFX10-NEXT:    s_branch .LBB10_32
+; GFX10-NEXT:  .LBB10_26:
+; GFX10-NEXT:    ; implicit-def: $vgpr9
+; GFX10-NEXT:  .LBB10_27: ; %frem.compute91
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v9, v11
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v13, v10
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v12, v11
+; GFX10-NEXT:    v_ldexp_f32 v11, v9, 11
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v9, v10
+; GFX10-NEXT:    v_ldexp_f32 v10, v13, 1
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v12
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX10-NEXT:    v_div_scale_f32 v14, s4, v10, v10, 1.0
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v9
+; GFX10-NEXT:    v_rcp_f32_e32 v15, v14
+; GFX10-NEXT:    v_not_b32_e32 v13, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v13, v13, v12
+; GFX10-NEXT:    v_div_scale_f32 v12, vcc_lo, 1.0, v10, 1.0
+; GFX10-NEXT:    s_denorm_mode 15
+; GFX10-NEXT:    v_fma_f32 v16, -v14, v15, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v15, v16, v15
+; GFX10-NEXT:    v_mul_f32_e32 v16, v12, v15
+; GFX10-NEXT:    v_fma_f32 v17, -v14, v16, v12
+; GFX10-NEXT:    v_fmac_f32_e32 v16, v17, v15
+; GFX10-NEXT:    v_fma_f32 v12, -v14, v16, v12
+; GFX10-NEXT:    s_denorm_mode 12
+; GFX10-NEXT:    v_div_fmas_f32 v12, v12, v15, v16
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v13
+; GFX10-NEXT:    v_div_fixup_f32 v12, v12, v10, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB10_31
+; GFX10-NEXT:  ; %bb.28: ; %frem.loop_body99.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 11
+; GFX10-NEXT:  .LBB10_29: ; %frem.loop_body99
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v14, v11
+; GFX10-NEXT:    s_add_i32 s2, s2, -11
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX10-NEXT:    v_mul_f32_e32 v11, v14, v12
+; GFX10-NEXT:    v_rndne_f32_e32 v11, v11
+; GFX10-NEXT:    v_fma_f32 v11, -v11, v10, v14
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_add_f32_e32 v13, v11, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v11, v11, 11
+; GFX10-NEXT:    s_cbranch_scc1 .LBB10_29
+; GFX10-NEXT:  ; %bb.30: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v13, s2
+; GFX10-NEXT:    v_mov_b32_e32 v11, v14
+; GFX10-NEXT:  .LBB10_31: ; %frem.loop_exit100
+; GFX10-NEXT:    v_add_nc_u32_e32 v13, -10, v13
+; GFX10-NEXT:    v_ldexp_f32 v11, v11, v13
+; GFX10-NEXT:    v_mul_f32_e32 v12, v11, v12
+; GFX10-NEXT:    v_rndne_f32_e32 v12, v12
+; GFX10-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_add_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v9, v10, v9
+; GFX10-NEXT:    v_and_b32_e32 v10, 0x8000, v8
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX10-NEXT:    v_xor_b32_e32 v9, v10, v9
+; GFX10-NEXT:  .LBB10_32: ; %Flow133
+; GFX10-NEXT:    v_cmp_class_f16_e64 s2, v0, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f16_e64 s3, v2, 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f16_e64 s4, v5, 0x1f8
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v4, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_cmp_class_f16_sdwa s3, v0, v10 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT:    v_cmp_neq_f16_sdwa s2, v0, v4 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    s_and_b32 s3, s3, s4
+; GFX10-NEXT:    v_cmp_class_f16_e64 s4, v8, 0x1f8
+; GFX10-NEXT:    s_and_b32 vcc_lo, s3, s2
+; GFX10-NEXT:    v_cmp_class_f16_e64 s2, v1, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f16_e64 s3, v3, 0x1f8
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f16_sdwa s2, v1, v4 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f16_sdwa s3, v1, v10 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT:    s_and_b32 s3, s3, s4
+; GFX10-NEXT:    s_and_b32 vcc_lo, s3, s2
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v9, vcc_lo
+; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -2688,214 +9652,879 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
-; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[4:5] offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v6, |v0|
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v9, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v9, v9
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v5, |v2|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    s_cbranch_vccz .LBB10_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v4, 0x7fff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB10_3
+; GFX11-NEXT:    s_branch .LBB10_8
+; GFX11-NEXT:  .LBB10_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr4
+; GFX11-NEXT:  .LBB10_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v4, v6
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v8, v5
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v7, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v6, v4, 11
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v4, v5
+; GFX11-NEXT:    v_ldexp_f32 v5, v8, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v7
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v9, null, v5, v5, 1.0
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX11-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v8, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, v8, v7
+; GFX11-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0
+; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v6
-; GFX11-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v10, v11, v10
+; GFX11-NEXT:    v_mul_f32_e32 v11, v7, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v12, -v9, v11, v7
+; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB10_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 11
+; GFX11-NEXT:  .LBB10_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v9, v6
+; GFX11-NEXT:    s_add_i32 s2, s2, -11
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v6, v9, v7
+; GFX11-NEXT:    v_rndne_f32_e32 v6, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v6, -v6, v5, v9
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX11-NEXT:    v_add_f32_e32 v8, v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v6, v6, 11
+; GFX11-NEXT:    s_cbranch_scc1 .LBB10_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow142
+; GFX11-NEXT:    v_mov_b32_e32 v8, s2
+; GFX11-NEXT:    v_mov_b32_e32 v6, v9
+; GFX11-NEXT:  .LBB10_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, -10, v8
+; GFX11-NEXT:    v_ldexp_f32 v6, v6, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX11-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
 ; GFX11-NEXT:    v_add_f32_e32 v5, v6, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; GFX11-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v9
-; GFX11-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0x8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX11-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX11-NEXT:  .LBB10_8:
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_fma_f16 v5, -v5, v3, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v9, |v5|
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v8, |v6|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v9, v8
+; GFX11-NEXT:    s_cbranch_vccz .LBB10_10
+; GFX11-NEXT:  ; %bb.9: ; %frem.else20
+; GFX11-NEXT:    v_bfi_b32 v7, 0x7fff, 0, v5
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v9, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB10_11
+; GFX11-NEXT:    s_branch .LBB10_16
+; GFX11-NEXT:  .LBB10_10:
+; GFX11-NEXT:    ; implicit-def: $vgpr7
+; GFX11-NEXT:  .LBB10_11: ; %frem.compute19
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v7, v9
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v11, v8
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v10, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v9, v7, 11
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v7, v8
+; GFX11-NEXT:    v_ldexp_f32 v8, v11, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v10
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v12, null, v8, v8, 1.0
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, -1, v7
+; GFX11-NEXT:    v_rcp_f32_e32 v13, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v11, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, v11, v10
+; GFX11-NEXT:    v_div_scale_f32 v10, vcc_lo, 1.0, v8, 1.0
+; GFX11-NEXT:    s_denorm_mode 15
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v14, -v12, v13, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v13, v14, v13
+; GFX11-NEXT:    v_mul_f32_e32 v14, v10, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v15, -v12, v14, v10
+; GFX11-NEXT:    v_fmac_f32_e32 v14, v15, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v10, -v12, v14, v10
+; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v10, v10, v13, v14
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v10, v10, v8, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB10_15
+; GFX11-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 11
+; GFX11-NEXT:  .LBB10_13: ; %frem.loop_body27
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v12, v9
+; GFX11-NEXT:    s_add_i32 s2, s2, -11
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v9, v12, v10
+; GFX11-NEXT:    v_rndne_f32_e32 v9, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v9, -v9, v8, v12
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_add_f32_e32 v11, v9, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v9, v9, 11
+; GFX11-NEXT:    s_cbranch_scc1 .LBB10_13
+; GFX11-NEXT:  ; %bb.14: ; %Flow138
+; GFX11-NEXT:    v_mov_b32_e32 v11, s2
+; GFX11-NEXT:    v_mov_b32_e32 v9, v12
+; GFX11-NEXT:  .LBB10_15: ; %frem.loop_exit28
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v10, v9
-; GFX11-NEXT:    v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, -10, v11
+; GFX11-NEXT:    v_ldexp_f32 v9, v9, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v10, v9, v10
+; GFX11-NEXT:    v_rndne_f32_e32 v10, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v9, -v10, v8, v9
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v9
+; GFX11-NEXT:    v_add_f32_e32 v8, v9, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v7, v8, v7
+; GFX11-NEXT:    v_and_b32_e32 v8, 0x8000, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX11-NEXT:    v_xor_b32_e32 v7, v8, v7
+; GFX11-NEXT:  .LBB10_16:
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v10, |v1|
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v9, |v3|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v10, v9
+; GFX11-NEXT:    s_cbranch_vccz .LBB10_18
+; GFX11-NEXT:  ; %bb.17: ; %frem.else56
+; GFX11-NEXT:    v_bfi_b32 v8, 0x7fff, 0, v1
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v10, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v1, v8, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB10_19
+; GFX11-NEXT:    s_branch .LBB10_24
+; GFX11-NEXT:  .LBB10_18:
+; GFX11-NEXT:    ; implicit-def: $vgpr8
+; GFX11-NEXT:  .LBB10_19: ; %frem.compute55
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v8, v10
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v12, v9
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v10, v8, 11
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v8, v9
+; GFX11-NEXT:    v_ldexp_f32 v9, v12, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v11
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v13, null, v9, v9, 1.0
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, -1, v8
+; GFX11-NEXT:    v_rcp_f32_e32 v14, v13
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v1, v1, v8, v6
-; GFX11-NEXT:    v_trunc_f16_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f16 v1, -v1, v8, v6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v7
-; GFX11-NEXT:    v_pack_b32_f16 v1, v5, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v8, v8
-; GFX11-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX11-NEXT:    v_not_b32_e32 v12, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, v12, v11
+; GFX11-NEXT:    v_div_scale_f32 v11, vcc_lo, 1.0, v9, 1.0
+; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX11-NEXT:    v_fma_f32 v15, -v13, v14, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_fmac_f32_e32 v3, v6, v5
+; GFX11-NEXT:    v_fmac_f32_e32 v14, v15, v14
+; GFX11-NEXT:    v_mul_f32_e32 v15, v11, v14
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX11-NEXT:    v_fma_f32 v16, -v13, v15, v11
+; GFX11-NEXT:    v_fmac_f32_e32 v15, v16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v11, -v13, v15, v11
+; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v11, v11, v14, v15
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v11, v11, v9, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB10_23
+; GFX11-NEXT:  ; %bb.20: ; %frem.loop_body63.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 11
+; GFX11-NEXT:  .LBB10_21: ; %frem.loop_body63
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v13, v10
+; GFX11-NEXT:    s_add_i32 s2, s2, -11
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
-; GFX11-NEXT:    v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
-; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_mul_f32_e32 v10, v13, v11
+; GFX11-NEXT:    v_rndne_f32_e32 v10, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v10, -v10, v9, v13
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v10
+; GFX11-NEXT:    v_add_f32_e32 v12, v10, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v10, v10, 11
+; GFX11-NEXT:    s_cbranch_scc1 .LBB10_21
+; GFX11-NEXT:  ; %bb.22: ; %Flow134
+; GFX11-NEXT:    v_mov_b32_e32 v12, s2
+; GFX11-NEXT:    v_mov_b32_e32 v10, v13
+; GFX11-NEXT:  .LBB10_23: ; %frem.loop_exit64
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, -10, v12
+; GFX11-NEXT:    v_ldexp_f32 v10, v10, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v11, v10, v11
+; GFX11-NEXT:    v_rndne_f32_e32 v11, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v10
+; GFX11-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v8, v9, v8
+; GFX11-NEXT:    v_and_b32_e32 v9, 0x8000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX11-NEXT:    v_xor_b32_e32 v8, v9, v8
+; GFX11-NEXT:  .LBB10_24:
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v0
-; GFX11-NEXT:    v_fmac_f32_e32 v6, v9, v8
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v13, |v9|
+; GFX11-NEXT:    v_cvt_f32_f16_e64 v12, |v10|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v13, v12
+; GFX11-NEXT:    s_cbranch_vccz .LBB10_26
+; GFX11-NEXT:  ; %bb.25: ; %frem.else92
+; GFX11-NEXT:    v_bfi_b32 v11, 0x7fff, 0, v9
+; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v13, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v9, v11, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB10_27
+; GFX11-NEXT:    s_branch .LBB10_32
+; GFX11-NEXT:  .LBB10_26:
+; GFX11-NEXT:    ; implicit-def: $vgpr11
+; GFX11-NEXT:  .LBB10_27: ; %frem.compute91
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v11, v13
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v15, v12
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v14, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f32 v13, v11, 11
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v11, v12
+; GFX11-NEXT:    v_ldexp_f32 v12, v15, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v14
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_scale_f32 v16, null, v12, v12, 1.0
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, -1, v11
+; GFX11-NEXT:    v_rcp_f32_e32 v17, v16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v15, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, v15, v14
+; GFX11-NEXT:    v_div_scale_f32 v14, vcc_lo, 1.0, v12, 1.0
+; GFX11-NEXT:    s_denorm_mode 15
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_fma_f32 v18, -v16, v17, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v17, v18, v17
+; GFX11-NEXT:    v_mul_f32_e32 v18, v14, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v19, -v16, v18, v14
+; GFX11-NEXT:    v_fmac_f32_e32 v18, v19, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v14, -v16, v18, v14
+; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v14, v14, v17, v18
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v14, v14, v12, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB10_31
+; GFX11-NEXT:  ; %bb.28: ; %frem.loop_body99.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 11
+; GFX11-NEXT:  .LBB10_29: ; %frem.loop_body99
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v16, v13
+; GFX11-NEXT:    s_add_i32 s2, s2, -11
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v13, v16, v14
+; GFX11-NEXT:    v_rndne_f32_e32 v13, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v13, -v13, v12, v16
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v13
+; GFX11-NEXT:    v_add_f32_e32 v15, v13, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v13, v15, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v13, v13, 11
+; GFX11-NEXT:    s_cbranch_scc1 .LBB10_29
+; GFX11-NEXT:  ; %bb.30: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v15, s2
+; GFX11-NEXT:    v_mov_b32_e32 v13, v16
+; GFX11-NEXT:  .LBB10_31: ; %frem.loop_exit100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, -10, v15
+; GFX11-NEXT:    v_ldexp_f32 v13, v13, v15
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v6
+; GFX11-NEXT:    v_mul_f32_e32 v14, v13, v14
+; GFX11-NEXT:    v_rndne_f32_e32 v14, v14
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_div_fixup_f16 v0, v0, v7, v5
+; GFX11-NEXT:    v_fma_f32 v13, -v14, v12, v13
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v13
+; GFX11-NEXT:    v_add_f32_e32 v12, v13, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f16_e32 v0, v0
-; GFX11-NEXT:    v_fma_f16 v0, -v0, v7, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v11, v12, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0x8000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX11-NEXT:    v_xor_b32_e32 v11, v12, v11
+; GFX11-NEXT:  .LBB10_32: ; %Flow133
+; GFX11-NEXT:    v_cmp_class_f16_e64 s2, v2, 0x3fc
+; GFX11-NEXT:    v_cmp_class_f16_e64 s3, v0, 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cmp_class_f16_e64 s3, v5, 0x1f8
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f16_e64 s2, v6, 0x3fc
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cmp_class_f16_e64 s3, v1, 0x1f8
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f16_e64 s2, v3, 0x3fc
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v3
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cmp_class_f16_e64 s3, v9, 0x1f8
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f16_e64 s2, v10, 0x3fc
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f16_e32 vcc_lo, 0, v10
+; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v11, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
+; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v4f16:
 ; GFX1150:       ; %bb.0:
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
-; GFX1150-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
-; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX1150-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v8, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v8, v8
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1150-NEXT:    global_load_b64 v[1:2], v2, s[6:7] offset:32
+; GFX1150-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1150-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    s_and_b32 s6, s4, 0x7fff
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX1150-NEXT:    s_cvt_f32_f16 s6, s6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3)
+; GFX1150-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, s6, v2
+; GFX1150-NEXT:    s_cbranch_vccz .LBB10_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_bfi_b32 v1, 0x7fff, 0, v0
+; GFX1150-NEXT:    v_cmp_eq_f32_e32 vcc_lo, s6, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB10_3
+; GFX1150-NEXT:    s_branch .LBB10_8
+; GFX1150-NEXT:  .LBB10_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr1
+; GFX1150-NEXT:  .LBB10_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v4, v2
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v1, v2
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v2, s6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX1150-NEXT:    v_ldexp_f32 v3, v1, 11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v1, s6
+; GFX1150-NEXT:    v_div_scale_f32 v6, null, v2, v2, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX1150-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v5, v1
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, v5, v4
+; GFX1150-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v8, v4, v7
+; GFX1150-NEXT:    v_fma_f32 v9, -v6, v8, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v6, v9, v8
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX1150-NEXT:    v_fma_f32 v4, -v6, v8, v4
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v5
+; GFX1150-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB10_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s6, s7, s6
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s6, s6, 11
+; GFX1150-NEXT:  .LBB10_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v3
+; GFX1150-NEXT:    s_add_i32 s6, s6, -11
+; GFX1150-NEXT:    s_cmp_gt_i32 s6, 11
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fma_f32 v3, v3, v2, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v5, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 11
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB10_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow142
+; GFX1150-NEXT:    v_mov_b32_e32 v5, s6
+; GFX1150-NEXT:    v_mov_b32_e32 v3, v6
+; GFX1150-NEXT:  .LBB10_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, -10, v5
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v8, 0xff800000, v8
-; GFX1150-NEXT:    v_add_f32_e32 v6, v8, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fmac_f32_e32 v3, v4, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX1150-NEXT:    v_and_b32_e32 v2, 0x8000, v0
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1150-NEXT:  .LBB10_8:
+; GFX1150-NEXT:    s_lshr_b32 s5, s5, 16
+; GFX1150-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX1150-NEXT:    s_and_b32 s7, s5, 0x7fff
+; GFX1150-NEXT:    s_and_b32 s9, s6, 0x7fff
+; GFX1150-NEXT:    s_cvt_f32_f16 s8, s7
+; GFX1150-NEXT:    s_cvt_f32_f16 s7, s9
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX1150-NEXT:    s_cmp_ngt_f32 s8, s7
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB10_10
+; GFX1150-NEXT:  ; %bb.9: ; %frem.else20
+; GFX1150-NEXT:    s_cmp_eq_f32 s8, s7
+; GFX1150-NEXT:    v_bfi_b32 v2, 0x7fff, 0, s5
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, s5, v2, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB10_11
+; GFX1150-NEXT:    s_branch .LBB10_16
+; GFX1150-NEXT:  .LBB10_10:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-NEXT:  .LBB10_11: ; %frem.compute19
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v3, s7
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v2, s8
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v5, s8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX1150-NEXT:    v_ldexp_f32 v4, v2, 11
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v2, s7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s8, v5
+; GFX1150-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX1150-NEXT:    v_rcp_f32_e32 v8, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_fmac_f16_e32 v5, v6, v7
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v2
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1150-NEXT:    v_not_b32_e32 v6, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v9, v5, v8
+; GFX1150-NEXT:    v_fma_f32 v10, -v7, v9, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v6, v8, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX1150-NEXT:    v_fma_f32 v5, -v7, v9, v5
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v6
+; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB10_15
+; GFX1150-NEXT:  ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-NEXT:    s_sub_i32 s7, s8, s7
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s7, s7, 11
+; GFX1150-NEXT:  .LBB10_13: ; %frem.loop_body27
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1150-NEXT:    s_add_i32 s7, s7, -11
+; GFX1150-NEXT:    s_cmp_gt_i32 s7, 11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fma_f32 v4, v4, v3, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, 11
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB10_13
+; GFX1150-NEXT:  ; %bb.14: ; %Flow138
+; GFX1150-NEXT:    v_mov_b32_e32 v6, s7
+; GFX1150-NEXT:    v_mov_b32_e32 v4, v7
+; GFX1150-NEXT:  .LBB10_15: ; %frem.loop_exit28
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, -10, v6
+; GFX1150-NEXT:    s_and_b32 s7, s5, 0x8000
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_f32_e32 v6, v7, v6
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v2, v0
-; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v4, v5, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT:    v_fma_f16 v0, v6, v2, v0
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v2, s7, v2
+; GFX1150-NEXT:  .LBB10_16:
+; GFX1150-NEXT:    s_and_b32 s7, s2, 0x7fff
+; GFX1150-NEXT:    s_and_b32 s9, s3, 0x7fff
+; GFX1150-NEXT:    s_cvt_f32_f16 s8, s7
+; GFX1150-NEXT:    s_cvt_f32_f16 s7, s9
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX1150-NEXT:    s_cmp_ngt_f32 s8, s7
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB10_18
+; GFX1150-NEXT:  ; %bb.17: ; %frem.else56
+; GFX1150-NEXT:    s_cmp_eq_f32 s8, s7
+; GFX1150-NEXT:    v_bfi_b32 v3, 0x7fff, 0, s2
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, s2, v3, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB10_19
+; GFX1150-NEXT:    s_branch .LBB10_24
+; GFX1150-NEXT:  .LBB10_18:
+; GFX1150-NEXT:    ; implicit-def: $vgpr3
+; GFX1150-NEXT:  .LBB10_19: ; %frem.compute55
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v4, s7
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v3, s8
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v6, s8
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT:    v_pack_b32_f16 v0, v0, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, 1
+; GFX1150-NEXT:    v_ldexp_f32 v5, v3, 11
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v3, s7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s8, v6
+; GFX1150-NEXT:    v_div_scale_f32 v8, null, v4, v4, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX1150-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX1150-NEXT:    v_rcp_f32_e32 v9, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v7, v3
+; GFX1150-NEXT:    v_add_nc_u32_e32 v7, v7, v6
+; GFX1150-NEXT:    v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v10, v6, v9
+; GFX1150-NEXT:    v_fma_f32 v11, -v8, v10, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v9
+; GFX1150-NEXT:    v_fma_f32 v6, -v8, v10, v6
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v7
+; GFX1150-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB10_23
+; GFX1150-NEXT:  ; %bb.20: ; %frem.loop_body63.preheader
+; GFX1150-NEXT:    s_sub_i32 s7, s8, s7
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s7, s7, 11
+; GFX1150-NEXT:  .LBB10_21: ; %frem.loop_body63
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1150-NEXT:    s_add_i32 s7, s7, -11
+; GFX1150-NEXT:    s_cmp_gt_i32 s7, 11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v8, v6
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fma_f32 v5, v5, v4, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v7, v5, v4
+; GFX1150-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, 11
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB10_21
+; GFX1150-NEXT:  ; %bb.22: ; %Flow134
+; GFX1150-NEXT:    v_mov_b32_e32 v7, s7
+; GFX1150-NEXT:    v_mov_b32_e32 v5, v8
+; GFX1150-NEXT:  .LBB10_23: ; %frem.loop_exit64
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v7, -10, v7
+; GFX1150-NEXT:    s_and_b32 s7, s2, 0x8000
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v5, v8, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v5, v6
+; GFX1150-NEXT:    v_rndne_f32_e32 v6, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX1150-NEXT:    v_fmac_f32_e32 v5, v6, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_f32_e32 v5, v7, v5
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX1150-NEXT:    v_ldexp_f32 v3, v4, v3
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, s7, v3
+; GFX1150-NEXT:  .LBB10_24:
+; GFX1150-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX1150-NEXT:    s_lshr_b32 s8, s3, 16
+; GFX1150-NEXT:    s_and_b32 s9, s7, 0x7fff
+; GFX1150-NEXT:    s_and_b32 s11, s8, 0x7fff
+; GFX1150-NEXT:    s_cvt_f32_f16 s10, s9
+; GFX1150-NEXT:    s_cvt_f32_f16 s9, s11
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX1150-NEXT:    s_cmp_ngt_f32 s10, s9
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB10_26
+; GFX1150-NEXT:  ; %bb.25: ; %frem.else92
+; GFX1150-NEXT:    s_cmp_eq_f32 s10, s9
+; GFX1150-NEXT:    v_bfi_b32 v4, 0x7fff, 0, s7
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, s7, v4, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB10_27
+; GFX1150-NEXT:    s_branch .LBB10_32
+; GFX1150-NEXT:  .LBB10_26:
+; GFX1150-NEXT:    ; implicit-def: $vgpr4
+; GFX1150-NEXT:  .LBB10_27: ; %frem.compute91
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v5, s9
+; GFX1150-NEXT:    v_frexp_mant_f32_e32 v4, s10
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v7, s10
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, 1
+; GFX1150-NEXT:    v_ldexp_f32 v6, v4, 11
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v4, s9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX1150-NEXT:    v_div_scale_f32 v9, null, v5, v5, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s9, v4
+; GFX1150-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX1150-NEXT:    v_rcp_f32_e32 v10, v9
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v6, v2
-; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1150-NEXT:    v_not_b32_e32 v8, v4
+; GFX1150-NEXT:    v_add_nc_u32_e32 v8, v8, v7
+; GFX1150-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v10
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT:    v_fmac_f16_e32 v2, v5, v6
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX1150-NEXT:    v_fma_f32 v12, -v9, v11, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v5, v7, v6
+; GFX1150-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX1150-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 12, v8
+; GFX1150-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB10_31
+; GFX1150-NEXT:  ; %bb.28: ; %frem.loop_body99.preheader
+; GFX1150-NEXT:    s_sub_i32 s9, s10, s9
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s9, s9, 11
+; GFX1150-NEXT:  .LBB10_29: ; %frem.loop_body99
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1150-NEXT:    s_add_i32 s9, s9, -11
+; GFX1150-NEXT:    s_cmp_gt_i32 s9, 11
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v9, v7
+; GFX1150-NEXT:    v_rndne_f32_e32 v6, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX1150-NEXT:    v_fma_f32 v6, v6, v5, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX1150-NEXT:    v_add_f32_e32 v8, v6, v5
+; GFX1150-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v6, v6, 11
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB10_29
+; GFX1150-NEXT:  ; %bb.30: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v8, s9
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v9
+; GFX1150-NEXT:  .LBB10_31: ; %frem.loop_exit100
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v8, -10, v8
+; GFX1150-NEXT:    s_and_b32 s9, s7, 0x8000
+; GFX1150-NEXT:    v_ldexp_f32 v6, v6, v8
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX1150-NEXT:    v_rndne_f32_e32 v7, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX1150-NEXT:    v_fmac_f32_e32 v6, v7, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX1150-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX1150-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v5, v3
-; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX1150-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v4, s9, v4
+; GFX1150-NEXT:  .LBB10_32: ; %Flow133
+; GFX1150-NEXT:    s_cmp_neq_f16 s4, 0
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s4, s4, 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s10, v0, 0x1f8
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s5, s5, 0x1f8
+; GFX1150-NEXT:    s_cselect_b32 s9, -1, 0
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s2, s2, 0x1f8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s4, s4, s10
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s4, s9
+; GFX1150-NEXT:    s_cmp_neq_f16 s6, 0
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s6, s6, 0x3fc
+; GFX1150-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX1150-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s5, s6, s5
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s5, s4
+; GFX1150-NEXT:    s_cmp_neq_f16 s3, 0
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s3, s3, 0x3fc
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
+; GFX1150-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    s_and_b32 s2, s3, s2
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s3, s8, 0x3fc
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, s4
+; GFX1150-NEXT:    v_cmp_class_f16_e64 s4, s7, 0x1f8
+; GFX1150-NEXT:    s_cmp_neq_f16 s8, 0
+; GFX1150-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v3, vcc_lo
+; GFX1150-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX1150-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1150-NEXT:    s_and_b32 s3, s3, s4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s3, s2
+; GFX1150-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
+; GFX1150-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
+; GFX1150-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
@@ -2910,194 +10539,701 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-LABEL: frem_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s8
-; SI-NEXT:    s_mov_b32 s1, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
-; SI-NEXT:    s_mov_b32 s10, s2
-; SI-NEXT:    s_mov_b32 s11, s3
-; SI-NEXT:    s_mov_b32 s6, s2
-; SI-NEXT:    s_mov_b32 s7, s3
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0 offset:32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
-; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
-; SI-NEXT:    v_rcp_f32_e32 v6, v5
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v2|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB11_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v4, s0, 0, v0
+; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v2|
+; SI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB11_3
+; SI-NEXT:    s_branch .LBB11_8
+; SI-NEXT:  .LBB11_2:
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB11_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s4
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v4, v0
+; SI-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v4
+; SI-NEXT:    s_cselect_b32 s2, s2, 0
+; SI-NEXT:    v_frexp_mant_f32_e64 v4, |v0|
+; SI-NEXT:    v_cndmask_b32_e64 v4, |v0|, v4, s[0:1]
+; SI-NEXT:    v_ldexp_f32_e64 v5, v4, 12
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, s4
+; SI-NEXT:    v_frexp_mant_f32_e64 v4, |v2|
+; SI-NEXT:    v_cndmask_b32_e64 v4, |v2|, v4, s[0:1]
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v6, v2
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v6
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v4, v4, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v4, 1.0
+; SI-NEXT:    v_div_scale_f32 v7, s[4:5], v4, v4, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v8, v7
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
-; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
-; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
-; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
-; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
+; SI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; SI-NEXT:    v_fma_f32 v8, v9, v8, v8
+; SI-NEXT:    v_mul_f32_e32 v9, v6, v8
+; SI-NEXT:    v_fma_f32 v10, -v7, v9, v6
+; SI-NEXT:    v_fma_f32 v9, v10, v8, v9
+; SI-NEXT:    v_fma_f32 v6, -v7, v9, v6
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
-; SI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
-; SI-NEXT:    v_trunc_f32_e32 v4, v4
-; SI-NEXT:    v_fma_f32 v1, -v4, v3, v1
-; SI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
-; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; SI-NEXT:    v_rcp_f32_e32 v5, v4
+; SI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
+; SI-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 13
+; SI-NEXT:    s_cbranch_scc1 .LBB11_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 12
+; SI-NEXT:  .LBB11_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NEXT:    v_mul_f32_e32 v5, v7, v6
+; SI-NEXT:    v_rndne_f32_e32 v5, v5
+; SI-NEXT:    v_fma_f32 v5, -v5, v4, v7
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; SI-NEXT:    v_add_f32_e32 v8, v5, v4
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v5, v5, 12
+; SI-NEXT:    s_add_i32 s1, s1, -12
+; SI-NEXT:    s_cmp_gt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB11_5
+; SI-NEXT:  ; %bb.6: ; %Flow54
+; SI-NEXT:    v_mov_b32_e32 v5, v7
+; SI-NEXT:  .LBB11_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    v_ldexp_f32_e64 v5, v5, s1
+; SI-NEXT:    v_mul_f32_e32 v6, v5, v6
+; SI-NEXT:    v_rndne_f32_e32 v6, v6
+; SI-NEXT:    v_fma_f32 v5, -v6, v4, v5
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v5
+; SI-NEXT:    v_add_f32_e32 v4, v5, v4
+; SI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v4, v4, s0
+; SI-NEXT:    v_and_b32_e32 v5, 0x80000000, v0
+; SI-NEXT:    v_xor_b32_e32 v4, v5, v4
+; SI-NEXT:  .LBB11_8:
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v1|, |v3|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB11_10
+; SI-NEXT:  ; %bb.9: ; %frem.else16
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v5, s0, 0, v1
+; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v3|
+; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB11_11
+; SI-NEXT:    s_branch .LBB11_16
+; SI-NEXT:  .LBB11_10:
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB11_11: ; %frem.compute15
+; SI-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, s4
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v5, v1
+; SI-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v5
+; SI-NEXT:    s_cselect_b32 s2, s2, 0
+; SI-NEXT:    v_frexp_mant_f32_e64 v5, |v1|
+; SI-NEXT:    v_cndmask_b32_e64 v5, |v1|, v5, s[0:1]
+; SI-NEXT:    v_ldexp_f32_e64 v6, v5, 12
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v3|, s4
+; SI-NEXT:    v_frexp_mant_f32_e64 v5, |v3|
+; SI-NEXT:    v_cndmask_b32_e64 v5, |v3|, v5, s[0:1]
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v3
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v7
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v5, v5, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v7, vcc, 1.0, v5, 1.0
+; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v5, v5, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v9, v8
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
-; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
-; SI-NEXT:    v_mul_f32_e32 v6, v3, v5
-; SI-NEXT:    v_fma_f32 v7, -v4, v6, v3
-; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
-; SI-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
+; SI-NEXT:    v_mul_f32_e32 v10, v7, v9
+; SI-NEXT:    v_fma_f32 v11, -v8, v10, v7
+; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
+; SI-NEXT:    v_fma_f32 v7, -v8, v10, v7
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
-; SI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
-; SI-NEXT:    v_trunc_f32_e32 v3, v3
-; SI-NEXT:    v_fma_f32 v0, -v3, v2, v0
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
+; SI-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 13
+; SI-NEXT:    s_cbranch_scc1 .LBB11_15
+; SI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 12
+; SI-NEXT:  .LBB11_13: ; %frem.loop_body23
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v8, v6
+; SI-NEXT:    v_mul_f32_e32 v6, v8, v7
+; SI-NEXT:    v_rndne_f32_e32 v6, v6
+; SI-NEXT:    v_fma_f32 v6, -v6, v5, v8
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; SI-NEXT:    v_add_f32_e32 v9, v6, v5
+; SI-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v6, v6, 12
+; SI-NEXT:    s_add_i32 s1, s1, -12
+; SI-NEXT:    s_cmp_gt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB11_13
+; SI-NEXT:  ; %bb.14: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:  .LBB11_15: ; %frem.loop_exit24
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    v_ldexp_f32_e64 v6, v6, s1
+; SI-NEXT:    v_mul_f32_e32 v7, v6, v7
+; SI-NEXT:    v_rndne_f32_e32 v7, v7
+; SI-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; SI-NEXT:    v_add_f32_e32 v5, v6, v5
+; SI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v5, v5, s0
+; SI-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; SI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; SI-NEXT:  .LBB11_16: ; %Flow53
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v2
+; SI-NEXT:    v_mov_b32_e32 v6, 0x3fc
+; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v6
+; SI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; SI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v2
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; SI-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v3
+; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v3, v6
+; SI-NEXT:    v_cmp_class_f32_e64 s[2:3], v1, v2
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: frem_v2f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s6, s2
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s0, s8
-; CI-NEXT:    s_mov_b32 s1, s9
-; CI-NEXT:    s_mov_b32 s8, s10
-; CI-NEXT:    s_mov_b32 s9, s11
-; CI-NEXT:    s_mov_b32 s10, s2
-; CI-NEXT:    s_mov_b32 s11, s3
-; CI-NEXT:    s_mov_b32 s7, s3
-; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0 offset:32
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
-; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
-; CI-NEXT:    v_rcp_f32_e32 v6, v5
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v2|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB11_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v4, s0, 0, v0
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v2|
+; CI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; CI-NEXT:    s_cbranch_execz .LBB11_3
+; CI-NEXT:    s_branch .LBB11_8
+; CI-NEXT:  .LBB11_2:
+; CI-NEXT:    ; implicit-def: $vgpr4
+; CI-NEXT:  .LBB11_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v5, |v2|
+; CI-NEXT:    v_ldexp_f32_e64 v5, v5, 1
+; CI-NEXT:    v_div_scale_f32 v11, s[0:1], v5, v5, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v4, |v0|
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v2
+; CI-NEXT:    v_ldexp_f32_e64 v7, v4, 12
+; CI-NEXT:    v_add_i32_e32 v4, vcc, -1, v10
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v9, v0
+; CI-NEXT:    v_not_b32_e32 v6, v4
+; CI-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v5, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v12, v11
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
-; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
-; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
-; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
-; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
+; CI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; CI-NEXT:    v_fma_f32 v12, v13, v12, v12
+; CI-NEXT:    v_mul_f32_e32 v13, v8, v12
+; CI-NEXT:    v_fma_f32 v14, -v11, v13, v8
+; CI-NEXT:    v_fma_f32 v13, v14, v12, v13
+; CI-NEXT:    v_fma_f32 v8, -v11, v13, v8
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
-; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
-; CI-NEXT:    v_trunc_f32_e32 v4, v4
-; CI-NEXT:    v_fma_f32 v1, -v4, v3, v1
-; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
-; CI-NEXT:    v_rcp_f32_e32 v5, v4
+; CI-NEXT:    v_div_fmas_f32 v8, v8, v12, v13
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v6
+; CI-NEXT:    v_div_fixup_f32 v8, v8, v5, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB11_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v6, vcc, v9, v10
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 12, v6
+; CI-NEXT:  .LBB11_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v9, v7
+; CI-NEXT:    v_mul_f32_e32 v7, v9, v8
+; CI-NEXT:    v_rndne_f32_e32 v7, v7
+; CI-NEXT:    v_fma_f32 v7, -v7, v5, v9
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT:    v_add_f32_e32 v10, v7, v5
+; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; CI-NEXT:    v_add_i32_e32 v6, vcc, -12, v6
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v6
+; CI-NEXT:    v_ldexp_f32_e64 v7, v7, 12
+; CI-NEXT:    s_cbranch_vccnz .LBB11_5
+; CI-NEXT:  ; %bb.6: ; %Flow54
+; CI-NEXT:    v_mov_b32_e32 v7, v9
+; CI-NEXT:  .LBB11_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v6, vcc, -11, v6
+; CI-NEXT:    v_ldexp_f32_e32 v6, v7, v6
+; CI-NEXT:    v_mul_f32_e32 v7, v6, v8
+; CI-NEXT:    v_rndne_f32_e32 v7, v7
+; CI-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT:    v_add_f32_e32 v5, v6, v5
+; CI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v4, v5, v4
+; CI-NEXT:    v_and_b32_e32 v5, 0x80000000, v0
+; CI-NEXT:    v_xor_b32_e32 v4, v5, v4
+; CI-NEXT:  .LBB11_8:
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v1|, |v3|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB11_10
+; CI-NEXT:  ; %bb.9: ; %frem.else16
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v5, s0, 0, v1
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v3|
+; CI-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
+; CI-NEXT:    s_cbranch_execz .LBB11_11
+; CI-NEXT:    s_branch .LBB11_16
+; CI-NEXT:  .LBB11_10:
+; CI-NEXT:    ; implicit-def: $vgpr5
+; CI-NEXT:  .LBB11_11: ; %frem.compute15
+; CI-NEXT:    v_frexp_mant_f32_e64 v6, |v3|
+; CI-NEXT:    v_ldexp_f32_e64 v6, v6, 1
+; CI-NEXT:    v_div_scale_f32 v12, s[0:1], v6, v6, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v5, |v1|
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v11, v3
+; CI-NEXT:    v_ldexp_f32_e64 v8, v5, 12
+; CI-NEXT:    v_add_i32_e32 v5, vcc, -1, v11
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v1
+; CI-NEXT:    v_not_b32_e32 v7, v5
+; CI-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CI-NEXT:    v_div_scale_f32 v9, vcc, 1.0, v6, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v13, v12
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
-; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
-; CI-NEXT:    v_mul_f32_e32 v6, v3, v5
-; CI-NEXT:    v_fma_f32 v7, -v4, v6, v3
-; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
-; CI-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; CI-NEXT:    v_fma_f32 v14, -v12, v13, 1.0
+; CI-NEXT:    v_fma_f32 v13, v14, v13, v13
+; CI-NEXT:    v_mul_f32_e32 v14, v9, v13
+; CI-NEXT:    v_fma_f32 v15, -v12, v14, v9
+; CI-NEXT:    v_fma_f32 v14, v15, v13, v14
+; CI-NEXT:    v_fma_f32 v9, -v12, v14, v9
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
-; CI-NEXT:    v_trunc_f32_e32 v3, v3
-; CI-NEXT:    v_fma_f32 v0, -v3, v2, v0
-; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    v_div_fmas_f32 v9, v9, v13, v14
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v7
+; CI-NEXT:    v_div_fixup_f32 v9, v9, v6, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB11_15
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT:    v_sub_i32_e32 v7, vcc, v10, v11
+; CI-NEXT:    v_add_i32_e32 v7, vcc, 12, v7
+; CI-NEXT:  .LBB11_13: ; %frem.loop_body23
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v10, v8
+; CI-NEXT:    v_mul_f32_e32 v8, v10, v9
+; CI-NEXT:    v_rndne_f32_e32 v8, v8
+; CI-NEXT:    v_fma_f32 v8, -v8, v6, v10
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v8
+; CI-NEXT:    v_add_f32_e32 v11, v8, v6
+; CI-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; CI-NEXT:    v_add_i32_e32 v7, vcc, -12, v7
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v7
+; CI-NEXT:    v_ldexp_f32_e64 v8, v8, 12
+; CI-NEXT:    s_cbranch_vccnz .LBB11_13
+; CI-NEXT:  ; %bb.14: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v8, v10
+; CI-NEXT:  .LBB11_15: ; %frem.loop_exit24
+; CI-NEXT:    v_add_i32_e32 v7, vcc, -11, v7
+; CI-NEXT:    v_ldexp_f32_e32 v7, v8, v7
+; CI-NEXT:    v_mul_f32_e32 v8, v7, v9
+; CI-NEXT:    v_rndne_f32_e32 v8, v8
+; CI-NEXT:    v_fma_f32 v7, -v8, v6, v7
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT:    v_add_f32_e32 v6, v7, v6
+; CI-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v5, v6, v5
+; CI-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; CI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; CI-NEXT:  .LBB11_16: ; %Flow53
+; CI-NEXT:    v_mov_b32_e32 v6, 0x3fc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v2
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v6
+; CI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v2
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], v3, v6
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], v1, v2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v3
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_u32 s0, s4, 32
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_addc_u32 s1, s5, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_add_u32 s0, s0, 32
+; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s11
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v3
-; VI-NEXT:    v_div_scale_f32 v6, vcc, v3, v5, v3
-; VI-NEXT:    v_rcp_f32_e32 v8, v7
+; VI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v2|
+; VI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; VI-NEXT:    s_cbranch_vccz .LBB11_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_brev_b32 s0, -2
+; VI-NEXT:    v_bfi_b32 v4, s0, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v2|
+; VI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; VI-NEXT:    s_cbranch_execz .LBB11_3
+; VI-NEXT:    s_branch .LBB11_8
+; VI-NEXT:  .LBB11_2:
+; VI-NEXT:    ; implicit-def: $vgpr4
+; VI-NEXT:  .LBB11_3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v5, |v2|
+; VI-NEXT:    v_ldexp_f32 v5, v5, 1
+; VI-NEXT:    v_div_scale_f32 v11, s[0:1], v5, v5, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v4, |v0|
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v2
+; VI-NEXT:    v_ldexp_f32 v7, v4, 12
+; VI-NEXT:    v_add_u32_e32 v4, vcc, -1, v10
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v9, v0
+; VI-NEXT:    v_not_b32_e32 v6, v4
+; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
+; VI-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v5, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v12, v11
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
-; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
-; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
-; VI-NEXT:    v_fma_f32 v10, -v7, v9, v6
-; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
-; VI-NEXT:    v_fma_f32 v6, -v7, v9, v6
+; VI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; VI-NEXT:    v_fma_f32 v12, v13, v12, v12
+; VI-NEXT:    v_mul_f32_e32 v13, v8, v12
+; VI-NEXT:    v_fma_f32 v14, -v11, v13, v8
+; VI-NEXT:    v_fma_f32 v13, v14, v12, v13
+; VI-NEXT:    v_fma_f32 v8, -v11, v13, v8
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
-; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v3
-; VI-NEXT:    v_trunc_f32_e32 v6, v6
-; VI-NEXT:    v_fma_f32 v3, -v6, v5, v3
-; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v2
-; VI-NEXT:    v_div_scale_f32 v5, vcc, v2, v4, v2
-; VI-NEXT:    v_rcp_f32_e32 v7, v6
+; VI-NEXT:    v_div_fmas_f32 v8, v8, v12, v13
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v6
+; VI-NEXT:    v_div_fixup_f32 v8, v8, v5, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB11_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, v9, v10
+; VI-NEXT:    v_add_u32_e32 v6, vcc, 12, v6
+; VI-NEXT:  .LBB11_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v9, v7
+; VI-NEXT:    v_mul_f32_e32 v7, v9, v8
+; VI-NEXT:    v_rndne_f32_e32 v7, v7
+; VI-NEXT:    v_fma_f32 v7, -v7, v5, v9
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT:    v_add_f32_e32 v10, v7, v5
+; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; VI-NEXT:    v_add_u32_e32 v6, vcc, -12, v6
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v6
+; VI-NEXT:    v_ldexp_f32 v7, v7, 12
+; VI-NEXT:    s_cbranch_vccnz .LBB11_5
+; VI-NEXT:  ; %bb.6: ; %Flow54
+; VI-NEXT:    v_mov_b32_e32 v7, v9
+; VI-NEXT:  .LBB11_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v6, vcc, -11, v6
+; VI-NEXT:    v_ldexp_f32 v6, v7, v6
+; VI-NEXT:    v_mul_f32_e32 v7, v6, v8
+; VI-NEXT:    v_rndne_f32_e32 v7, v7
+; VI-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT:    v_add_f32_e32 v5, v6, v5
+; VI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; VI-NEXT:    v_ldexp_f32 v4, v5, v4
+; VI-NEXT:    v_and_b32_e32 v5, 0x80000000, v0
+; VI-NEXT:    v_xor_b32_e32 v4, v5, v4
+; VI-NEXT:  .LBB11_8:
+; VI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v1|, |v3|
+; VI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; VI-NEXT:    s_cbranch_vccz .LBB11_10
+; VI-NEXT:  ; %bb.9: ; %frem.else16
+; VI-NEXT:    s_brev_b32 s0, -2
+; VI-NEXT:    v_bfi_b32 v5, s0, 0, v1
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v3|
+; VI-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
+; VI-NEXT:    s_cbranch_execz .LBB11_11
+; VI-NEXT:    s_branch .LBB11_16
+; VI-NEXT:  .LBB11_10:
+; VI-NEXT:    ; implicit-def: $vgpr5
+; VI-NEXT:  .LBB11_11: ; %frem.compute15
+; VI-NEXT:    v_frexp_mant_f32_e64 v6, |v3|
+; VI-NEXT:    v_ldexp_f32 v6, v6, 1
+; VI-NEXT:    v_div_scale_f32 v12, s[0:1], v6, v6, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v5, |v1|
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v11, v3
+; VI-NEXT:    v_ldexp_f32 v8, v5, 12
+; VI-NEXT:    v_add_u32_e32 v5, vcc, -1, v11
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v1
+; VI-NEXT:    v_not_b32_e32 v7, v5
+; VI-NEXT:    v_add_u32_e32 v7, vcc, v7, v10
+; VI-NEXT:    v_div_scale_f32 v9, vcc, 1.0, v6, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v13, v12
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
-; VI-NEXT:    v_fma_f32 v7, v8, v7, v7
-; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
-; VI-NEXT:    v_fma_f32 v9, -v6, v8, v5
-; VI-NEXT:    v_fma_f32 v8, v9, v7, v8
-; VI-NEXT:    v_fma_f32 v5, -v6, v8, v5
+; VI-NEXT:    v_fma_f32 v14, -v12, v13, 1.0
+; VI-NEXT:    v_fma_f32 v13, v14, v13, v13
+; VI-NEXT:    v_mul_f32_e32 v14, v9, v13
+; VI-NEXT:    v_fma_f32 v15, -v12, v14, v9
+; VI-NEXT:    v_fma_f32 v14, v15, v13, v14
+; VI-NEXT:    v_fma_f32 v9, -v12, v14, v9
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
-; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v2
-; VI-NEXT:    v_trunc_f32_e32 v5, v5
-; VI-NEXT:    v_fma_f32 v2, -v5, v4, v2
-; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT:    v_div_fmas_f32 v9, v9, v13, v14
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v7
+; VI-NEXT:    v_div_fixup_f32 v9, v9, v6, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB11_15
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT:    v_sub_u32_e32 v7, vcc, v10, v11
+; VI-NEXT:    v_add_u32_e32 v7, vcc, 12, v7
+; VI-NEXT:  .LBB11_13: ; %frem.loop_body23
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v10, v8
+; VI-NEXT:    v_mul_f32_e32 v8, v10, v9
+; VI-NEXT:    v_rndne_f32_e32 v8, v8
+; VI-NEXT:    v_fma_f32 v8, -v8, v6, v10
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v8
+; VI-NEXT:    v_add_f32_e32 v11, v8, v6
+; VI-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; VI-NEXT:    v_add_u32_e32 v7, vcc, -12, v7
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v7
+; VI-NEXT:    v_ldexp_f32 v8, v8, 12
+; VI-NEXT:    s_cbranch_vccnz .LBB11_13
+; VI-NEXT:  ; %bb.14: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v8, v10
+; VI-NEXT:  .LBB11_15: ; %frem.loop_exit24
+; VI-NEXT:    v_add_u32_e32 v7, vcc, -11, v7
+; VI-NEXT:    v_ldexp_f32 v7, v8, v7
+; VI-NEXT:    v_mul_f32_e32 v8, v7, v9
+; VI-NEXT:    v_rndne_f32_e32 v8, v8
+; VI-NEXT:    v_fma_f32 v7, -v8, v6, v7
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT:    v_add_f32_e32 v6, v7, v6
+; VI-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; VI-NEXT:    v_ldexp_f32 v5, v6, v5
+; VI-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; VI-NEXT:    v_xor_b32_e32 v5, v6, v5
+; VI-NEXT:  .LBB11_16: ; %Flow53
+; VI-NEXT:    v_mov_b32_e32 v8, 0x3fc
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v2
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v8
+; VI-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v2
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], v3, v8
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], v1, v2
+; VI-NEXT:    v_cndmask_b32_e32 v0, v9, v4, vcc
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v3
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v6, s8
+; VI-NEXT:    v_mov_b32_e32 v7, s9
+; VI-NEXT:    v_cndmask_b32_e32 v1, v9, v5, vcc
+; VI-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: frem_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7] offset:32
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[10:11]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_div_scale_f32 v6, s[2:3], v3, v3, v1
-; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v2|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB11_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_brev_b32 s0, -2
+; GFX9-NEXT:    v_bfi_b32 v4, s0, 0, v0
+; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB11_3
+; GFX9-NEXT:    s_branch .LBB11_8
+; GFX9-NEXT:  .LBB11_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr4
+; GFX9-NEXT:  .LBB11_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v5, |v2|
+; GFX9-NEXT:    v_ldexp_f32 v5, v5, 1
+; GFX9-NEXT:    v_div_scale_f32 v11, s[0:1], v5, v5, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v5, 1.0
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v4, |v0|
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v9, v0
+; GFX9-NEXT:    v_ldexp_f32 v7, v4, 12
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v10, v2
+; GFX9-NEXT:    v_add_u32_e32 v4, -1, v10
+; GFX9-NEXT:    v_not_b32_e32 v6, v4
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v9
+; GFX9-NEXT:    v_rcp_f32_e32 v12, v11
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX9-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
-; GFX9-NEXT:    v_fma_f32 v7, v8, v7, v7
-; GFX9-NEXT:    v_mul_f32_e32 v8, v5, v7
-; GFX9-NEXT:    v_fma_f32 v9, -v6, v8, v5
-; GFX9-NEXT:    v_fma_f32 v8, v9, v7, v8
-; GFX9-NEXT:    v_fma_f32 v5, -v6, v8, v5
+; GFX9-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; GFX9-NEXT:    v_fma_f32 v12, v13, v12, v12
+; GFX9-NEXT:    v_mul_f32_e32 v13, v8, v12
+; GFX9-NEXT:    v_fma_f32 v14, -v11, v13, v8
+; GFX9-NEXT:    v_fma_f32 v13, v14, v12, v13
+; GFX9-NEXT:    v_fma_f32 v8, -v11, v13, v8
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
-; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_fma_f32 v1, -v5, v3, v1
-; GFX9-NEXT:    v_div_scale_f32 v5, s[2:3], v2, v2, v0
-; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
-; GFX9-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX9-NEXT:    v_div_fmas_f32 v8, v8, v12, v13
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v6
+; GFX9-NEXT:    v_div_fixup_f32 v8, v8, v5, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB11_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v6, v9, v10
+; GFX9-NEXT:    v_add_u32_e32 v6, 12, v6
+; GFX9-NEXT:  .LBB11_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v7
+; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v8
+; GFX9-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX9-NEXT:    v_fma_f32 v7, -v7, v5, v9
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_add_f32_e32 v10, v7, v5
+; GFX9-NEXT:    v_add_u32_e32 v6, -12, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v6
+; GFX9-NEXT:    v_ldexp_f32 v7, v7, 12
+; GFX9-NEXT:    s_cbranch_vccnz .LBB11_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow54
+; GFX9-NEXT:    v_mov_b32_e32 v7, v9
+; GFX9-NEXT:  .LBB11_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v6, -11, v6
+; GFX9-NEXT:    v_ldexp_f32 v6, v7, v6
+; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v8
+; GFX9-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX9-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, 0x80000000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX9-NEXT:  .LBB11_8:
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v1|, |v3|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB11_10
+; GFX9-NEXT:  ; %bb.9: ; %frem.else16
+; GFX9-NEXT:    s_brev_b32 s0, -2
+; GFX9-NEXT:    v_bfi_b32 v5, s0, 0, v1
+; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB11_11
+; GFX9-NEXT:    s_branch .LBB11_16
+; GFX9-NEXT:  .LBB11_10:
+; GFX9-NEXT:    ; implicit-def: $vgpr5
+; GFX9-NEXT:  .LBB11_11: ; %frem.compute15
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v6, |v3|
+; GFX9-NEXT:    v_ldexp_f32 v6, v6, 1
+; GFX9-NEXT:    v_div_scale_f32 v12, s[0:1], v6, v6, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v9, vcc, 1.0, v6, 1.0
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v5, |v1|
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v10, v1
+; GFX9-NEXT:    v_ldexp_f32 v8, v5, 12
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v11, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, -1, v11
+; GFX9-NEXT:    v_not_b32_e32 v7, v5
+; GFX9-NEXT:    v_add_u32_e32 v7, v7, v10
+; GFX9-NEXT:    v_rcp_f32_e32 v13, v12
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX9-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; GFX9-NEXT:    v_fma_f32 v6, v7, v6, v6
-; GFX9-NEXT:    v_mul_f32_e32 v7, v3, v6
-; GFX9-NEXT:    v_fma_f32 v8, -v5, v7, v3
-; GFX9-NEXT:    v_fma_f32 v7, v8, v6, v7
-; GFX9-NEXT:    v_fma_f32 v3, -v5, v7, v3
+; GFX9-NEXT:    v_fma_f32 v14, -v12, v13, 1.0
+; GFX9-NEXT:    v_fma_f32 v13, v14, v13, v13
+; GFX9-NEXT:    v_mul_f32_e32 v14, v9, v13
+; GFX9-NEXT:    v_fma_f32 v15, -v12, v14, v9
+; GFX9-NEXT:    v_fma_f32 v14, v15, v13, v14
+; GFX9-NEXT:    v_fma_f32 v9, -v12, v14, v9
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
-; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX9-NEXT:    v_fma_f32 v0, -v3, v2, v0
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    v_div_fmas_f32 v9, v9, v13, v14
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v7
+; GFX9-NEXT:    v_div_fixup_f32 v9, v9, v6, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB11_15
+; GFX9-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v7, v10, v11
+; GFX9-NEXT:    v_add_u32_e32 v7, 12, v7
+; GFX9-NEXT:  .LBB11_13: ; %frem.loop_body23
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-NEXT:    v_mul_f32_e32 v8, v10, v9
+; GFX9-NEXT:    v_rndne_f32_e32 v8, v8
+; GFX9-NEXT:    v_fma_f32 v8, -v8, v6, v10
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_add_f32_e32 v11, v8, v6
+; GFX9-NEXT:    v_add_u32_e32 v7, -12, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v7
+; GFX9-NEXT:    v_ldexp_f32 v8, v8, 12
+; GFX9-NEXT:    s_cbranch_vccnz .LBB11_13
+; GFX9-NEXT:  ; %bb.14: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v8, v10
+; GFX9-NEXT:  .LBB11_15: ; %frem.loop_exit24
+; GFX9-NEXT:    v_add_u32_e32 v7, -11, v7
+; GFX9-NEXT:    v_ldexp_f32 v7, v8, v7
+; GFX9-NEXT:    v_mul_f32_e32 v8, v7, v9
+; GFX9-NEXT:    v_rndne_f32_e32 v8, v8
+; GFX9-NEXT:    v_fma_f32 v7, -v8, v6, v7
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_add_f32_e32 v6, v7, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_ldexp_f32 v5, v6, v5
+; GFX9-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; GFX9-NEXT:    v_xor_b32_e32 v5, v6, v5
+; GFX9-NEXT:  .LBB11_16: ; %Flow53
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3fc
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v6
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v2
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[0:1], v3, v6
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[2:3], v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: frem_v2f32:
@@ -3111,37 +11247,160 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7] offset:32
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v6, s2, v3, v3, v1
-; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s2, |v0|, |v2|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB11_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v0
+; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2|
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB11_3
+; GFX10-NEXT:    s_branch .LBB11_8
+; GFX10-NEXT:  .LBB11_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr4
+; GFX10-NEXT:  .LBB11_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v5, |v2|
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v4, |v0|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; GFX10-NEXT:    v_ldexp_f32 v5, v5, 1
+; GFX10-NEXT:    v_ldexp_f32 v6, v4, 12
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v4, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v7
+; GFX10-NEXT:    v_div_scale_f32 v9, s4, v5, v5, 1.0
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX10-NEXT:    v_not_b32_e32 v8, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, v8, v7
+; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0
 ; GFX10-NEXT:    s_denorm_mode 15
-; GFX10-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v7
-; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
-; GFX10-NEXT:    v_fma_f32 v9, -v6, v8, v5
-; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v7
-; GFX10-NEXT:    v_fma_f32 v5, -v6, v8, v5
+; GFX10-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v7
+; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX10-NEXT:    v_fma_f32 v7, -v9, v11, v7
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
-; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
-; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX10-NEXT:    v_fma_f32 v1, -v5, v3, v1
-; GFX10-NEXT:    v_div_scale_f32 v5, s2, v2, v2, v0
-; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
-; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v8
+; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB11_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 12
+; GFX10-NEXT:  .LBB11_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v6
+; GFX10-NEXT:    s_add_i32 s2, s2, -12
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX10-NEXT:    v_mul_f32_e32 v6, v9, v7
+; GFX10-NEXT:    v_rndne_f32_e32 v6, v6
+; GFX10-NEXT:    v_fma_f32 v6, -v6, v5, v9
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_add_f32_e32 v8, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v6, v6, 12
+; GFX10-NEXT:    s_cbranch_scc1 .LBB11_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow54
+; GFX10-NEXT:    v_mov_b32_e32 v8, s2
+; GFX10-NEXT:    v_mov_b32_e32 v6, v9
+; GFX10-NEXT:  .LBB11_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, -11, v8
+; GFX10-NEXT:    v_ldexp_f32 v6, v6, v8
+; GFX10-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX10-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX10-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 0x80000000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX10-NEXT:  .LBB11_8:
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s2, |v1|, |v3|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB11_10
+; GFX10-NEXT:  ; %bb.9: ; %frem.else16
+; GFX10-NEXT:    v_bfi_b32 v5, 0x7fffffff, 0, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3|
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB11_11
+; GFX10-NEXT:    s_branch .LBB11_16
+; GFX10-NEXT:  .LBB11_10:
+; GFX10-NEXT:    ; implicit-def: $vgpr5
+; GFX10-NEXT:  .LBB11_11: ; %frem.compute15
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v6, |v3|
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v5, |v1|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v8, v1
+; GFX10-NEXT:    v_ldexp_f32 v6, v6, 1
+; GFX10-NEXT:    v_ldexp_f32 v7, v5, 12
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v5, v3
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX10-NEXT:    v_div_scale_f32 v10, s4, v6, v6, 1.0
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, -1, v5
+; GFX10-NEXT:    v_rcp_f32_e32 v11, v10
+; GFX10-NEXT:    v_not_b32_e32 v9, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, v9, v8
+; GFX10-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0
 ; GFX10-NEXT:    s_denorm_mode 15
-; GFX10-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v6
-; GFX10-NEXT:    v_mul_f32_e32 v7, v3, v6
-; GFX10-NEXT:    v_fma_f32 v8, -v5, v7, v3
-; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v6
-; GFX10-NEXT:    v_fma_f32 v3, -v5, v7, v3
+; GFX10-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v11
+; GFX10-NEXT:    v_mul_f32_e32 v12, v8, v11
+; GFX10-NEXT:    v_fma_f32 v13, -v10, v12, v8
+; GFX10-NEXT:    v_fmac_f32_e32 v12, v13, v11
+; GFX10-NEXT:    v_fma_f32 v8, -v10, v12, v8
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
-; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_f32 v0, -v3, v2, v0
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT:    v_div_fmas_f32 v8, v8, v11, v12
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v9
+; GFX10-NEXT:    v_div_fixup_f32 v8, v8, v6, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB11_15
+; GFX10-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 12
+; GFX10-NEXT:  .LBB11_13: ; %frem.loop_body23
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v10, v7
+; GFX10-NEXT:    s_add_i32 s2, s2, -12
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX10-NEXT:    v_mul_f32_e32 v7, v10, v8
+; GFX10-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX10-NEXT:    v_fma_f32 v7, -v7, v6, v10
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_add_f32_e32 v9, v7, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v7, v7, 12
+; GFX10-NEXT:    s_cbranch_scc1 .LBB11_13
+; GFX10-NEXT:  ; %bb.14: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v9, s2
+; GFX10-NEXT:    v_mov_b32_e32 v7, v10
+; GFX10-NEXT:  .LBB11_15: ; %frem.loop_exit24
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, -11, v9
+; GFX10-NEXT:    v_ldexp_f32 v7, v7, v9
+; GFX10-NEXT:    v_mul_f32_e32 v8, v7, v8
+; GFX10-NEXT:    v_rndne_f32_e32 v8, v8
+; GFX10-NEXT:    v_fma_f32 v7, -v8, v6, v7
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    v_add_f32_e32 v6, v7, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v5, v6, v5
+; GFX10-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; GFX10-NEXT:    v_xor_b32_e32 v5, v6, v5
+; GFX10-NEXT:  .LBB11_16: ; %Flow53
+; GFX10-NEXT:    v_cmp_class_f32_e64 s2, v2, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    v_cmp_class_f32_e64 s3, v1, 0x1f8
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 s2, v3, 0x3fc
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: frem_v2f32:
@@ -3149,55 +11408,209 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
-; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[4:5] offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v6, null, v3, v3, v1
-; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s2, |v0|, |v2|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB11_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB11_3
+; GFX11-NEXT:    s_branch .LBB11_8
+; GFX11-NEXT:  .LBB11_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr4
+; GFX11-NEXT:  .LBB11_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v5, |v2|
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v4, |v0|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v7, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v5, v5, 1
+; GFX11-NEXT:    v_ldexp_f32 v6, v4, 12
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v4, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v7
+; GFX11-NEXT:    v_div_scale_f32 v9, null, v5, v5, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX11-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v8, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, v8, v7
+; GFX11-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX11-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v8, v5, v7
-; GFX11-NEXT:    v_fma_f32 v9, -v6, v8, v5
+; GFX11-NEXT:    v_fmac_f32_e32 v10, v11, v10
+; GFX11-NEXT:    v_mul_f32_e32 v11, v7, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v8, v9, v7
-; GFX11-NEXT:    v_fma_f32 v5, -v6, v8, v5
+; GFX11-NEXT:    v_fma_f32 v12, -v9, v11, v7
+; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v7, -v9, v11, v7
 ; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB11_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 12
+; GFX11-NEXT:  .LBB11_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v9, v6
+; GFX11-NEXT:    s_add_i32 s2, s2, -12
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v6, v9, v7
+; GFX11-NEXT:    v_rndne_f32_e32 v6, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
-; GFX11-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
+; GFX11-NEXT:    v_fma_f32 v6, -v6, v5, v9
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX11-NEXT:    v_add_f32_e32 v8, v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v6, v6, 12
+; GFX11-NEXT:    s_cbranch_scc1 .LBB11_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow54
+; GFX11-NEXT:    v_mov_b32_e32 v8, s2
+; GFX11-NEXT:    v_mov_b32_e32 v6, v9
+; GFX11-NEXT:  .LBB11_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, -11, v8
+; GFX11-NEXT:    v_ldexp_f32 v6, v6, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX11-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0x80000000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v4, v5, v4
+; GFX11-NEXT:  .LBB11_8:
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s2, |v1|, |v3|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB11_10
+; GFX11-NEXT:  ; %bb.9: ; %frem.else16
+; GFX11-NEXT:    v_bfi_b32 v5, 0x7fffffff, 0, v1
+; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB11_11
+; GFX11-NEXT:    s_branch .LBB11_16
+; GFX11-NEXT:  .LBB11_10:
+; GFX11-NEXT:    ; implicit-def: $vgpr5
+; GFX11-NEXT:  .LBB11_11: ; %frem.compute15
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v6, |v3|
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v5, |v1|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v8, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v6, v6, 1
+; GFX11-NEXT:    v_ldexp_f32 v7, v5, 12
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v5, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX11-NEXT:    v_div_scale_f32 v10, null, v6, v6, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v5, -1, v5
+; GFX11-NEXT:    v_rcp_f32_e32 v11, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX11-NEXT:    v_fma_f32 v1, -v5, v3, v1
-; GFX11-NEXT:    v_div_scale_f32 v5, null, v2, v2, v0
-; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX11-NEXT:    v_not_b32_e32 v9, v5
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, v9, v8
+; GFX11-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; GFX11-NEXT:    v_fmac_f32_e32 v6, v7, v6
+; GFX11-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v7, v3, v6
-; GFX11-NEXT:    v_fma_f32 v8, -v5, v7, v3
+; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v11
+; GFX11-NEXT:    v_mul_f32_e32 v12, v8, v11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v6
-; GFX11-NEXT:    v_fma_f32 v3, -v5, v7, v3
+; GFX11-NEXT:    v_fma_f32 v13, -v10, v12, v8
+; GFX11-NEXT:    v_fmac_f32_e32 v12, v13, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v8, -v10, v12, v8
 ; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v8, v8, v11, v12
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v8, v8, v6, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB11_15
+; GFX11-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 12
+; GFX11-NEXT:  .LBB11_13: ; %frem.loop_body23
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v10, v7
+; GFX11-NEXT:    s_add_i32 s2, s2, -12
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v7, v10, v8
+; GFX11-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v7, -v7, v6, v10
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v7
+; GFX11-NEXT:    v_add_f32_e32 v9, v7, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v7, v7, 12
+; GFX11-NEXT:    s_cbranch_scc1 .LBB11_13
+; GFX11-NEXT:  ; %bb.14: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v9, s2
+; GFX11-NEXT:    v_mov_b32_e32 v7, v10
+; GFX11-NEXT:  .LBB11_15: ; %frem.loop_exit24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, -11, v9
+; GFX11-NEXT:    v_ldexp_f32 v7, v7, v9
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
-; GFX11-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
+; GFX11-NEXT:    v_mul_f32_e32 v8, v7, v8
+; GFX11-NEXT:    v_rndne_f32_e32 v8, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX11-NEXT:    v_fma_f32 v0, -v3, v2, v0
-; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT:    v_fma_f32 v7, -v8, v6, v7
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v7
+; GFX11-NEXT:    v_add_f32_e32 v6, v7, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v5, v6, v5
+; GFX11-NEXT:    v_and_b32_e32 v6, 0x80000000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v5, v6, v5
+; GFX11-NEXT:  .LBB11_16: ; %Flow53
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, v2, 0x3fc
+; GFX11-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cmp_class_f32_e64 s3, v1, 0x1f8
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, v3, 0x3fc
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0x7fc00000, v5
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f32:
@@ -3205,57 +11618,220 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
-; GFX1150-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1150-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v2, s[4:5] offset:32
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_div_scale_f32 v6, null, v3, v3, v1
-; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX1150-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v0
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1150-NEXT:    s_and_b32 s5, s3, 0x7fffffff
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, s5, v2
+; GFX1150-NEXT:    s_cbranch_vccz .LBB11_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_bfi_b32 v3, 0x7fffffff, 0, v0
+; GFX1150-NEXT:    v_cmp_eq_f32_e32 vcc_lo, s5, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v0, v3, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB11_3
+; GFX1150-NEXT:    s_branch .LBB11_8
+; GFX1150-NEXT:  .LBB11_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-NEXT:  .LBB11_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v3, |s3|
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v2, |v0|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v5, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX1150-NEXT:    v_ldexp_f32 v4, v2, 12
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v2, s3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX1150-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX1150-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v6, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
 ; GFX1150-NEXT:    s_denorm_mode 15
-; GFX1150-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v8
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v7
-; GFX1150-NEXT:    v_mul_f32_e32 v8, v5, v7
+; GFX1150-NEXT:    v_mul_f32_e32 v9, v5, v8
+; GFX1150-NEXT:    v_fma_f32 v10, -v7, v9, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v9, -v6, v8, v5
-; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v5, -v6, v8, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX1150-NEXT:    v_fma_f32 v5, -v7, v9, v5
 ; GFX1150-NEXT:    s_denorm_mode 12
-; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v6
+; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB11_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s4, s4, s5
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s4, s4, 12
+; GFX1150-NEXT:  .LBB11_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1150-NEXT:    s_add_i32 s4, s4, -12
+; GFX1150-NEXT:    s_cmp_gt_i32 s4, 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
-; GFX1150-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fma_f32 v4, v4, v3, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, 12
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB11_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow54
+; GFX1150-NEXT:    v_mov_b32_e32 v6, s4
+; GFX1150-NEXT:    v_mov_b32_e32 v4, v7
+; GFX1150-NEXT:  .LBB11_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, -11, v6
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
-; GFX1150-NEXT:    v_fma_f32 v1, v5, v3, v1
-; GFX1150-NEXT:    v_div_scale_f32 v5, null, v2, v2, v0
-; GFX1150-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v4, v5, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX1150-NEXT:    v_and_b32_e32 v3, 0x80000000, v0
+; GFX1150-NEXT:    v_xor_b32_e32 v2, v3, v2
+; GFX1150-NEXT:  .LBB11_8:
+; GFX1150-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v1
+; GFX1150-NEXT:    s_and_b32 s5, s2, 0x7fffffff
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, s5, v3
+; GFX1150-NEXT:    s_cbranch_vccz .LBB11_10
+; GFX1150-NEXT:  ; %bb.9: ; %frem.else16
+; GFX1150-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v1
+; GFX1150-NEXT:    v_cmp_eq_f32_e32 vcc_lo, s5, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v1, v4, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB11_11
+; GFX1150-NEXT:    s_branch .LBB11_16
+; GFX1150-NEXT:  .LBB11_10:
+; GFX1150-NEXT:    ; implicit-def: $vgpr3
+; GFX1150-NEXT:  .LBB11_11: ; %frem.compute15
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v4, |s2|
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v3, |v1|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, 1
+; GFX1150-NEXT:    v_ldexp_f32 v5, v3, 12
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v3, s2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX1150-NEXT:    v_div_scale_f32 v8, null, v4, v4, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1150-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX1150-NEXT:    v_rcp_f32_e32 v9, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v7, v3
+; GFX1150-NEXT:    v_add_nc_u32_e32 v7, v7, v6
+; GFX1150-NEXT:    v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0
 ; GFX1150-NEXT:    s_denorm_mode 15
-; GFX1150-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v9
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v6, v7, v6
-; GFX1150-NEXT:    v_mul_f32_e32 v7, v3, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v10, v6, v9
+; GFX1150-NEXT:    v_fma_f32 v11, -v8, v10, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v8, -v5, v7, v3
-; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v3, -v5, v7, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v9
+; GFX1150-NEXT:    v_fma_f32 v6, -v8, v10, v6
 ; GFX1150-NEXT:    s_denorm_mode 12
-; GFX1150-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v7
+; GFX1150-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB11_15
+; GFX1150-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1150-NEXT:    s_sub_i32 s4, s4, s5
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s4, s4, 12
+; GFX1150-NEXT:  .LBB11_13: ; %frem.loop_body23
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1150-NEXT:    s_add_i32 s4, s4, -12
+; GFX1150-NEXT:    s_cmp_gt_i32 s4, 12
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
-; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v8, v6
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX1150-NEXT:    v_fmac_f32_e32 v0, v3, v2
-; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fma_f32 v5, v5, v4, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v7, v5, v4
+; GFX1150-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB11_13
+; GFX1150-NEXT:  ; %bb.14: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v7, s4
+; GFX1150-NEXT:    v_mov_b32_e32 v5, v8
+; GFX1150-NEXT:  .LBB11_15: ; %frem.loop_exit24
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v7, -11, v7
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v5, v6
+; GFX1150-NEXT:    v_rndne_f32_e32 v6, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX1150-NEXT:    v_fmac_f32_e32 v5, v6, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v4, v3
+; GFX1150-NEXT:    v_and_b32_e32 v4, 0x80000000, v1
+; GFX1150-NEXT:    v_xor_b32_e32 v3, v4, v3
+; GFX1150-NEXT:  .LBB11_16: ; %Flow53
+; GFX1150-NEXT:    s_cmp_neq_f32 s3, 0
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s3, s3, 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s5, v0, 0x1f8
+; GFX1150-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s3, s3, s5
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s3, s4
+; GFX1150-NEXT:    s_cmp_neq_f32 s2, 0
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s4, v1, 0x1f8
+; GFX1150-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX1150-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s4
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, s3
+; GFX1150-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0x7fc00000, v3
+; GFX1150-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
@@ -3270,314 +11846,1319 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-LABEL: frem_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_mov_b32 s3, 0xf000
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s0, s8
-; SI-NEXT:    s_mov_b32 s1, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
-; SI-NEXT:    s_mov_b32 s10, s2
-; SI-NEXT:    s_mov_b32 s11, s3
-; SI-NEXT:    s_mov_b32 s6, s2
-; SI-NEXT:    s_mov_b32 s7, s3
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
+; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
-; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
-; SI-NEXT:    v_rcp_f32_e32 v10, v9
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v4|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB12_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v8, s0, 0, v0
+; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v4|
+; SI-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB12_3
+; SI-NEXT:    s_branch .LBB12_8
+; SI-NEXT:  .LBB12_2:
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB12_3: ; %frem.compute
+; SI-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s4
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v0
+; SI-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v8
+; SI-NEXT:    s_cselect_b32 s2, s2, 0
+; SI-NEXT:    v_frexp_mant_f32_e64 v8, |v0|
+; SI-NEXT:    v_cndmask_b32_e64 v8, |v0|, v8, s[0:1]
+; SI-NEXT:    v_ldexp_f32_e64 v9, v8, 12
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v4|, s4
+; SI-NEXT:    v_frexp_mant_f32_e64 v8, |v4|
+; SI-NEXT:    v_cndmask_b32_e64 v8, |v4|, v8, s[0:1]
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v4
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v10
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v8, v8, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v10, vcc, 1.0, v8, 1.0
+; SI-NEXT:    v_div_scale_f32 v11, s[4:5], v8, v8, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v12, v11
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
-; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
-; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
-; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
-; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
-; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
+; SI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; SI-NEXT:    v_fma_f32 v12, v13, v12, v12
+; SI-NEXT:    v_mul_f32_e32 v13, v10, v12
+; SI-NEXT:    v_fma_f32 v14, -v11, v13, v10
+; SI-NEXT:    v_fma_f32 v13, v14, v12, v13
+; SI-NEXT:    v_fma_f32 v10, -v11, v13, v10
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
-; SI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
-; SI-NEXT:    v_trunc_f32_e32 v8, v8
-; SI-NEXT:    v_fma_f32 v3, -v8, v7, v3
-; SI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
-; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
-; SI-NEXT:    v_rcp_f32_e32 v9, v8
+; SI-NEXT:    v_div_fmas_f32 v10, v10, v12, v13
+; SI-NEXT:    v_div_fixup_f32 v10, v10, v8, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 13
+; SI-NEXT:    s_cbranch_scc1 .LBB12_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 12
+; SI-NEXT:  .LBB12_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v11, v9
+; SI-NEXT:    v_mul_f32_e32 v9, v11, v10
+; SI-NEXT:    v_rndne_f32_e32 v9, v9
+; SI-NEXT:    v_fma_f32 v9, -v9, v8, v11
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v9
+; SI-NEXT:    v_add_f32_e32 v12, v9, v8
+; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v9, v9, 12
+; SI-NEXT:    s_add_i32 s1, s1, -12
+; SI-NEXT:    s_cmp_gt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB12_5
+; SI-NEXT:  ; %bb.6: ; %Flow134
+; SI-NEXT:    v_mov_b32_e32 v9, v11
+; SI-NEXT:  .LBB12_7: ; %frem.loop_exit
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    v_ldexp_f32_e64 v9, v9, s1
+; SI-NEXT:    v_mul_f32_e32 v10, v9, v10
+; SI-NEXT:    v_rndne_f32_e32 v10, v10
+; SI-NEXT:    v_fma_f32 v9, -v10, v8, v9
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v9
+; SI-NEXT:    v_add_f32_e32 v8, v9, v8
+; SI-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v8, v8, s0
+; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v0
+; SI-NEXT:    v_xor_b32_e32 v8, v9, v8
+; SI-NEXT:  .LBB12_8:
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v1|, |v5|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB12_10
+; SI-NEXT:  ; %bb.9: ; %frem.else16
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v9, s0, 0, v1
+; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v5|
+; SI-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB12_11
+; SI-NEXT:    s_branch .LBB12_16
+; SI-NEXT:  .LBB12_10:
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB12_11: ; %frem.compute15
+; SI-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, s4
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v9, v1
+; SI-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v9
+; SI-NEXT:    s_cselect_b32 s2, s2, 0
+; SI-NEXT:    v_frexp_mant_f32_e64 v9, |v1|
+; SI-NEXT:    v_cndmask_b32_e64 v9, |v1|, v9, s[0:1]
+; SI-NEXT:    v_ldexp_f32_e64 v10, v9, 12
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v5|, s4
+; SI-NEXT:    v_frexp_mant_f32_e64 v9, |v5|
+; SI-NEXT:    v_cndmask_b32_e64 v9, |v5|, v9, s[0:1]
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v11, v5
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v11
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v9, v9, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v11, vcc, 1.0, v9, 1.0
+; SI-NEXT:    v_div_scale_f32 v12, s[4:5], v9, v9, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v13, v12
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
-; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
-; SI-NEXT:    v_mul_f32_e32 v10, v7, v9
-; SI-NEXT:    v_fma_f32 v11, -v8, v10, v7
-; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
-; SI-NEXT:    v_fma_f32 v7, -v8, v10, v7
+; SI-NEXT:    v_fma_f32 v14, -v12, v13, 1.0
+; SI-NEXT:    v_fma_f32 v13, v14, v13, v13
+; SI-NEXT:    v_mul_f32_e32 v14, v11, v13
+; SI-NEXT:    v_fma_f32 v15, -v12, v14, v11
+; SI-NEXT:    v_fma_f32 v14, v15, v13, v14
+; SI-NEXT:    v_fma_f32 v11, -v12, v14, v11
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
-; SI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
-; SI-NEXT:    v_trunc_f32_e32 v7, v7
-; SI-NEXT:    v_fma_f32 v2, -v7, v6, v2
-; SI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
-; SI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
-; SI-NEXT:    v_rcp_f32_e32 v8, v7
+; SI-NEXT:    v_div_fmas_f32 v11, v11, v13, v14
+; SI-NEXT:    v_div_fixup_f32 v11, v11, v9, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 13
+; SI-NEXT:    s_cbranch_scc1 .LBB12_15
+; SI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 12
+; SI-NEXT:  .LBB12_13: ; %frem.loop_body23
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v12, v10
+; SI-NEXT:    v_mul_f32_e32 v10, v12, v11
+; SI-NEXT:    v_rndne_f32_e32 v10, v10
+; SI-NEXT:    v_fma_f32 v10, -v10, v9, v12
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; SI-NEXT:    v_add_f32_e32 v13, v10, v9
+; SI-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v10, v10, 12
+; SI-NEXT:    s_add_i32 s1, s1, -12
+; SI-NEXT:    s_cmp_gt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB12_13
+; SI-NEXT:  ; %bb.14: ; %Flow130
+; SI-NEXT:    v_mov_b32_e32 v10, v12
+; SI-NEXT:  .LBB12_15: ; %frem.loop_exit24
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    v_ldexp_f32_e64 v10, v10, s1
+; SI-NEXT:    v_mul_f32_e32 v11, v10, v11
+; SI-NEXT:    v_rndne_f32_e32 v11, v11
+; SI-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; SI-NEXT:    v_add_f32_e32 v9, v10, v9
+; SI-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v9, v9, s0
+; SI-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; SI-NEXT:    v_xor_b32_e32 v9, v10, v9
+; SI-NEXT:  .LBB12_16:
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v2|, |v6|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB12_18
+; SI-NEXT:  ; %bb.17: ; %frem.else50
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v10, s0, 0, v2
+; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v2|, |v6|
+; SI-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB12_19
+; SI-NEXT:    s_branch .LBB12_24
+; SI-NEXT:  .LBB12_18:
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB12_19: ; %frem.compute49
+; SI-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, s4
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v10, v2
+; SI-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v10
+; SI-NEXT:    s_cselect_b32 s2, s2, 0
+; SI-NEXT:    v_frexp_mant_f32_e64 v10, |v2|
+; SI-NEXT:    v_cndmask_b32_e64 v10, |v2|, v10, s[0:1]
+; SI-NEXT:    v_ldexp_f32_e64 v11, v10, 12
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v6|, s4
+; SI-NEXT:    v_frexp_mant_f32_e64 v10, |v6|
+; SI-NEXT:    v_cndmask_b32_e64 v10, |v6|, v10, s[0:1]
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v12, v6
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v12
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v10, v10, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v12, vcc, 1.0, v10, 1.0
+; SI-NEXT:    v_div_scale_f32 v13, s[4:5], v10, v10, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v14, v13
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
-; SI-NEXT:    v_fma_f32 v8, v9, v8, v8
-; SI-NEXT:    v_mul_f32_e32 v9, v6, v8
-; SI-NEXT:    v_fma_f32 v10, -v7, v9, v6
-; SI-NEXT:    v_fma_f32 v9, v10, v8, v9
-; SI-NEXT:    v_fma_f32 v6, -v7, v9, v6
+; SI-NEXT:    v_fma_f32 v15, -v13, v14, 1.0
+; SI-NEXT:    v_fma_f32 v14, v15, v14, v14
+; SI-NEXT:    v_mul_f32_e32 v15, v12, v14
+; SI-NEXT:    v_fma_f32 v16, -v13, v15, v12
+; SI-NEXT:    v_fma_f32 v15, v16, v14, v15
+; SI-NEXT:    v_fma_f32 v12, -v13, v15, v12
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
-; SI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
-; SI-NEXT:    v_trunc_f32_e32 v6, v6
-; SI-NEXT:    v_fma_f32 v1, -v6, v5, v1
-; SI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
-; SI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
-; SI-NEXT:    v_rcp_f32_e32 v7, v6
+; SI-NEXT:    v_div_fmas_f32 v12, v12, v14, v15
+; SI-NEXT:    v_div_fixup_f32 v12, v12, v10, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 13
+; SI-NEXT:    s_cbranch_scc1 .LBB12_23
+; SI-NEXT:  ; %bb.20: ; %frem.loop_body57.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 12
+; SI-NEXT:  .LBB12_21: ; %frem.loop_body57
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v13, v11
+; SI-NEXT:    v_mul_f32_e32 v11, v13, v12
+; SI-NEXT:    v_rndne_f32_e32 v11, v11
+; SI-NEXT:    v_fma_f32 v11, -v11, v10, v13
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; SI-NEXT:    v_add_f32_e32 v14, v11, v10
+; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v11, v11, 12
+; SI-NEXT:    s_add_i32 s1, s1, -12
+; SI-NEXT:    s_cmp_gt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB12_21
+; SI-NEXT:  ; %bb.22: ; %Flow126
+; SI-NEXT:    v_mov_b32_e32 v11, v13
+; SI-NEXT:  .LBB12_23: ; %frem.loop_exit58
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    v_ldexp_f32_e64 v11, v11, s1
+; SI-NEXT:    v_mul_f32_e32 v12, v11, v12
+; SI-NEXT:    v_rndne_f32_e32 v12, v12
+; SI-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; SI-NEXT:    v_add_f32_e32 v10, v11, v10
+; SI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v10, v10, s0
+; SI-NEXT:    v_and_b32_e32 v11, 0x80000000, v2
+; SI-NEXT:    v_xor_b32_e32 v10, v11, v10
+; SI-NEXT:  .LBB12_24:
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v3|, |v7|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB12_26
+; SI-NEXT:  ; %bb.25: ; %frem.else84
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_bfi_b32 v11, s0, 0, v3
+; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v3|, |v7|
+; SI-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB12_27
+; SI-NEXT:    s_branch .LBB12_32
+; SI-NEXT:  .LBB12_26:
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB12_27: ; %frem.compute83
+; SI-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v3|, s4
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v11, v3
+; SI-NEXT:    s_and_b64 s[2:3], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v11
+; SI-NEXT:    s_cselect_b32 s2, s2, 0
+; SI-NEXT:    v_frexp_mant_f32_e64 v11, |v3|
+; SI-NEXT:    v_cndmask_b32_e64 v11, |v3|, v11, s[0:1]
+; SI-NEXT:    v_ldexp_f32_e64 v12, v11, 12
+; SI-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v7|, s4
+; SI-NEXT:    v_frexp_mant_f32_e64 v11, |v7|
+; SI-NEXT:    v_cndmask_b32_e64 v11, |v7|, v11, s[0:1]
+; SI-NEXT:    v_frexp_exp_i32_f32_e32 v13, v7
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v13
+; SI-NEXT:    s_cselect_b32 s3, s0, 0
+; SI-NEXT:    s_add_i32 s0, s3, -1
+; SI-NEXT:    v_ldexp_f32_e64 v11, v11, 1
+; SI-NEXT:    s_not_b32 s1, s0
+; SI-NEXT:    s_add_i32 s1, s1, s2
+; SI-NEXT:    v_div_scale_f32 v13, vcc, 1.0, v11, 1.0
+; SI-NEXT:    v_div_scale_f32 v14, s[4:5], v11, v11, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v15, v14
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
-; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
-; SI-NEXT:    v_mul_f32_e32 v8, v5, v7
-; SI-NEXT:    v_fma_f32 v9, -v6, v8, v5
-; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
-; SI-NEXT:    v_fma_f32 v5, -v6, v8, v5
+; SI-NEXT:    v_fma_f32 v16, -v14, v15, 1.0
+; SI-NEXT:    v_fma_f32 v15, v16, v15, v15
+; SI-NEXT:    v_mul_f32_e32 v16, v13, v15
+; SI-NEXT:    v_fma_f32 v17, -v14, v16, v13
+; SI-NEXT:    v_fma_f32 v16, v17, v15, v16
+; SI-NEXT:    v_fma_f32 v13, -v14, v16, v13
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
-; SI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
-; SI-NEXT:    v_trunc_f32_e32 v5, v5
-; SI-NEXT:    v_fma_f32 v0, -v5, v4, v0
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    v_div_fmas_f32 v13, v13, v15, v16
+; SI-NEXT:    v_div_fixup_f32 v13, v13, v11, 1.0
+; SI-NEXT:    s_cmp_lt_i32 s1, 13
+; SI-NEXT:    s_cbranch_scc1 .LBB12_31
+; SI-NEXT:  ; %bb.28: ; %frem.loop_body91.preheader
+; SI-NEXT:    s_sub_i32 s1, s2, s3
+; SI-NEXT:    s_add_i32 s1, s1, 12
+; SI-NEXT:  .LBB12_29: ; %frem.loop_body91
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v14, v12
+; SI-NEXT:    v_mul_f32_e32 v12, v14, v13
+; SI-NEXT:    v_rndne_f32_e32 v12, v12
+; SI-NEXT:    v_fma_f32 v12, -v12, v11, v14
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; SI-NEXT:    v_add_f32_e32 v15, v12, v11
+; SI-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v12, v12, 12
+; SI-NEXT:    s_add_i32 s1, s1, -12
+; SI-NEXT:    s_cmp_gt_i32 s1, 12
+; SI-NEXT:    s_cbranch_scc1 .LBB12_29
+; SI-NEXT:  ; %bb.30: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v12, v14
+; SI-NEXT:  .LBB12_31: ; %frem.loop_exit92
+; SI-NEXT:    s_add_i32 s1, s1, -11
+; SI-NEXT:    v_ldexp_f32_e64 v12, v12, s1
+; SI-NEXT:    v_mul_f32_e32 v13, v12, v13
+; SI-NEXT:    v_rndne_f32_e32 v13, v13
+; SI-NEXT:    v_fma_f32 v12, -v13, v11, v12
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; SI-NEXT:    v_add_f32_e32 v11, v12, v11
+; SI-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; SI-NEXT:    v_ldexp_f32_e64 v11, v11, s0
+; SI-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; SI-NEXT:    v_xor_b32_e32 v11, v12, v11
+; SI-NEXT:  .LBB12_32: ; %Flow125
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v4
+; SI-NEXT:    v_mov_b32_e32 v12, 0x3fc
+; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v4, v12
+; SI-NEXT:    v_mov_b32_e32 v4, 0x1f8
+; SI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v4
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v13, 0x7fc00000
+; SI-NEXT:    v_cndmask_b32_e32 v0, v13, v8, vcc
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v5
+; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v5, v12
+; SI-NEXT:    v_cmp_class_f32_e64 s[2:3], v1, v4
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e32 v1, v13, v9, vcc
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v6
+; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v6, v12
+; SI-NEXT:    v_cmp_class_f32_e64 s[2:3], v2, v4
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, v13, v10, vcc
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v7
+; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v7, v12
+; SI-NEXT:    v_cmp_class_f32_e64 s[2:3], v3, v4
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e32 v3, v13, v11, vcc
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: frem_v4f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s6, s2
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s0, s8
-; CI-NEXT:    s_mov_b32 s1, s9
-; CI-NEXT:    s_mov_b32 s8, s10
-; CI-NEXT:    s_mov_b32 s9, s11
-; CI-NEXT:    s_mov_b32 s10, s2
-; CI-NEXT:    s_mov_b32 s11, s3
-; CI-NEXT:    s_mov_b32 s7, s3
-; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
-; CI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
-; CI-NEXT:    v_rcp_f32_e32 v10, v9
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v4|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB12_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v8, s0, 0, v0
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v4|
+; CI-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
+; CI-NEXT:    s_cbranch_execz .LBB12_3
+; CI-NEXT:    s_branch .LBB12_8
+; CI-NEXT:  .LBB12_2:
+; CI-NEXT:    ; implicit-def: $vgpr8
+; CI-NEXT:  .LBB12_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f32_e64 v9, |v4|
+; CI-NEXT:    v_ldexp_f32_e64 v9, v9, 1
+; CI-NEXT:    v_div_scale_f32 v15, s[0:1], v9, v9, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v8, |v0|
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v14, v4
+; CI-NEXT:    v_ldexp_f32_e64 v11, v8, 12
+; CI-NEXT:    v_add_i32_e32 v8, vcc, -1, v14
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v13, v0
+; CI-NEXT:    v_not_b32_e32 v10, v8
+; CI-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CI-NEXT:    v_div_scale_f32 v12, vcc, 1.0, v9, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v16, v15
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
-; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
-; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
-; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
-; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
-; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
+; CI-NEXT:    v_fma_f32 v17, -v15, v16, 1.0
+; CI-NEXT:    v_fma_f32 v16, v17, v16, v16
+; CI-NEXT:    v_mul_f32_e32 v17, v12, v16
+; CI-NEXT:    v_fma_f32 v18, -v15, v17, v12
+; CI-NEXT:    v_fma_f32 v17, v18, v16, v17
+; CI-NEXT:    v_fma_f32 v12, -v15, v17, v12
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
-; CI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
-; CI-NEXT:    v_trunc_f32_e32 v8, v8
-; CI-NEXT:    v_fma_f32 v3, -v8, v7, v3
-; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
-; CI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
-; CI-NEXT:    v_rcp_f32_e32 v9, v8
+; CI-NEXT:    v_div_fmas_f32 v12, v12, v16, v17
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v10
+; CI-NEXT:    v_div_fixup_f32 v12, v12, v9, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB12_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v10, vcc, v13, v14
+; CI-NEXT:    v_add_i32_e32 v10, vcc, 12, v10
+; CI-NEXT:  .LBB12_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v13, v11
+; CI-NEXT:    v_mul_f32_e32 v11, v13, v12
+; CI-NEXT:    v_rndne_f32_e32 v11, v11
+; CI-NEXT:    v_fma_f32 v11, -v11, v9, v13
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; CI-NEXT:    v_add_f32_e32 v14, v11, v9
+; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; CI-NEXT:    v_add_i32_e32 v10, vcc, -12, v10
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v10
+; CI-NEXT:    v_ldexp_f32_e64 v11, v11, 12
+; CI-NEXT:    s_cbranch_vccnz .LBB12_5
+; CI-NEXT:  ; %bb.6: ; %Flow134
+; CI-NEXT:    v_mov_b32_e32 v11, v13
+; CI-NEXT:  .LBB12_7: ; %frem.loop_exit
+; CI-NEXT:    v_add_i32_e32 v10, vcc, -11, v10
+; CI-NEXT:    v_ldexp_f32_e32 v10, v11, v10
+; CI-NEXT:    v_mul_f32_e32 v11, v10, v12
+; CI-NEXT:    v_rndne_f32_e32 v11, v11
+; CI-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; CI-NEXT:    v_add_f32_e32 v9, v10, v9
+; CI-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v8, v9, v8
+; CI-NEXT:    v_and_b32_e32 v9, 0x80000000, v0
+; CI-NEXT:    v_xor_b32_e32 v8, v9, v8
+; CI-NEXT:  .LBB12_8:
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v1|, |v5|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB12_10
+; CI-NEXT:  ; %bb.9: ; %frem.else16
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v9, s0, 0, v1
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v5|
+; CI-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc
+; CI-NEXT:    s_cbranch_execz .LBB12_11
+; CI-NEXT:    s_branch .LBB12_16
+; CI-NEXT:  .LBB12_10:
+; CI-NEXT:    ; implicit-def: $vgpr9
+; CI-NEXT:  .LBB12_11: ; %frem.compute15
+; CI-NEXT:    v_frexp_mant_f32_e64 v10, |v5|
+; CI-NEXT:    v_ldexp_f32_e64 v10, v10, 1
+; CI-NEXT:    v_div_scale_f32 v16, s[0:1], v10, v10, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v9, |v1|
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v15, v5
+; CI-NEXT:    v_ldexp_f32_e64 v12, v9, 12
+; CI-NEXT:    v_add_i32_e32 v9, vcc, -1, v15
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v14, v1
+; CI-NEXT:    v_not_b32_e32 v11, v9
+; CI-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CI-NEXT:    v_div_scale_f32 v13, vcc, 1.0, v10, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v17, v16
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
-; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
-; CI-NEXT:    v_mul_f32_e32 v10, v7, v9
-; CI-NEXT:    v_fma_f32 v11, -v8, v10, v7
-; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
-; CI-NEXT:    v_fma_f32 v7, -v8, v10, v7
+; CI-NEXT:    v_fma_f32 v18, -v16, v17, 1.0
+; CI-NEXT:    v_fma_f32 v17, v18, v17, v17
+; CI-NEXT:    v_mul_f32_e32 v18, v13, v17
+; CI-NEXT:    v_fma_f32 v19, -v16, v18, v13
+; CI-NEXT:    v_fma_f32 v18, v19, v17, v18
+; CI-NEXT:    v_fma_f32 v13, -v16, v18, v13
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
-; CI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
-; CI-NEXT:    v_trunc_f32_e32 v7, v7
-; CI-NEXT:    v_fma_f32 v2, -v7, v6, v2
-; CI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
-; CI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
-; CI-NEXT:    v_rcp_f32_e32 v8, v7
+; CI-NEXT:    v_div_fmas_f32 v13, v13, v17, v18
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v11
+; CI-NEXT:    v_div_fixup_f32 v13, v13, v10, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB12_15
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT:    v_sub_i32_e32 v11, vcc, v14, v15
+; CI-NEXT:    v_add_i32_e32 v11, vcc, 12, v11
+; CI-NEXT:  .LBB12_13: ; %frem.loop_body23
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v14, v12
+; CI-NEXT:    v_mul_f32_e32 v12, v14, v13
+; CI-NEXT:    v_rndne_f32_e32 v12, v12
+; CI-NEXT:    v_fma_f32 v12, -v12, v10, v14
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; CI-NEXT:    v_add_f32_e32 v15, v12, v10
+; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
+; CI-NEXT:    v_add_i32_e32 v11, vcc, -12, v11
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v11
+; CI-NEXT:    v_ldexp_f32_e64 v12, v12, 12
+; CI-NEXT:    s_cbranch_vccnz .LBB12_13
+; CI-NEXT:  ; %bb.14: ; %Flow130
+; CI-NEXT:    v_mov_b32_e32 v12, v14
+; CI-NEXT:  .LBB12_15: ; %frem.loop_exit24
+; CI-NEXT:    v_add_i32_e32 v11, vcc, -11, v11
+; CI-NEXT:    v_ldexp_f32_e32 v11, v12, v11
+; CI-NEXT:    v_mul_f32_e32 v12, v11, v13
+; CI-NEXT:    v_rndne_f32_e32 v12, v12
+; CI-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; CI-NEXT:    v_add_f32_e32 v10, v11, v10
+; CI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v9, v10, v9
+; CI-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; CI-NEXT:    v_xor_b32_e32 v9, v10, v9
+; CI-NEXT:  .LBB12_16:
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v2|, |v6|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB12_18
+; CI-NEXT:  ; %bb.17: ; %frem.else50
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v10, s0, 0, v2
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v2|, |v6|
+; CI-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
+; CI-NEXT:    s_cbranch_execz .LBB12_19
+; CI-NEXT:    s_branch .LBB12_24
+; CI-NEXT:  .LBB12_18:
+; CI-NEXT:    ; implicit-def: $vgpr10
+; CI-NEXT:  .LBB12_19: ; %frem.compute49
+; CI-NEXT:    v_frexp_mant_f32_e64 v11, |v6|
+; CI-NEXT:    v_ldexp_f32_e64 v11, v11, 1
+; CI-NEXT:    v_div_scale_f32 v17, s[0:1], v11, v11, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v10, |v2|
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v16, v6
+; CI-NEXT:    v_ldexp_f32_e64 v13, v10, 12
+; CI-NEXT:    v_add_i32_e32 v10, vcc, -1, v16
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v15, v2
+; CI-NEXT:    v_not_b32_e32 v12, v10
+; CI-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CI-NEXT:    v_div_scale_f32 v14, vcc, 1.0, v11, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v18, v17
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
-; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
-; CI-NEXT:    v_mul_f32_e32 v9, v6, v8
-; CI-NEXT:    v_fma_f32 v10, -v7, v9, v6
-; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
-; CI-NEXT:    v_fma_f32 v6, -v7, v9, v6
+; CI-NEXT:    v_fma_f32 v19, -v17, v18, 1.0
+; CI-NEXT:    v_fma_f32 v18, v19, v18, v18
+; CI-NEXT:    v_mul_f32_e32 v19, v14, v18
+; CI-NEXT:    v_fma_f32 v20, -v17, v19, v14
+; CI-NEXT:    v_fma_f32 v19, v20, v18, v19
+; CI-NEXT:    v_fma_f32 v14, -v17, v19, v14
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
-; CI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
-; CI-NEXT:    v_trunc_f32_e32 v6, v6
-; CI-NEXT:    v_fma_f32 v1, -v6, v5, v1
-; CI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
-; CI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
-; CI-NEXT:    v_rcp_f32_e32 v7, v6
+; CI-NEXT:    v_div_fmas_f32 v14, v14, v18, v19
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v12
+; CI-NEXT:    v_div_fixup_f32 v14, v14, v11, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB12_23
+; CI-NEXT:  ; %bb.20: ; %frem.loop_body57.preheader
+; CI-NEXT:    v_sub_i32_e32 v12, vcc, v15, v16
+; CI-NEXT:    v_add_i32_e32 v12, vcc, 12, v12
+; CI-NEXT:  .LBB12_21: ; %frem.loop_body57
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v15, v13
+; CI-NEXT:    v_mul_f32_e32 v13, v15, v14
+; CI-NEXT:    v_rndne_f32_e32 v13, v13
+; CI-NEXT:    v_fma_f32 v13, -v13, v11, v15
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v13
+; CI-NEXT:    v_add_f32_e32 v16, v13, v11
+; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc
+; CI-NEXT:    v_add_i32_e32 v12, vcc, -12, v12
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v12
+; CI-NEXT:    v_ldexp_f32_e64 v13, v13, 12
+; CI-NEXT:    s_cbranch_vccnz .LBB12_21
+; CI-NEXT:  ; %bb.22: ; %Flow126
+; CI-NEXT:    v_mov_b32_e32 v13, v15
+; CI-NEXT:  .LBB12_23: ; %frem.loop_exit58
+; CI-NEXT:    v_add_i32_e32 v12, vcc, -11, v12
+; CI-NEXT:    v_ldexp_f32_e32 v12, v13, v12
+; CI-NEXT:    v_mul_f32_e32 v13, v12, v14
+; CI-NEXT:    v_rndne_f32_e32 v13, v13
+; CI-NEXT:    v_fma_f32 v12, -v13, v11, v12
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; CI-NEXT:    v_add_f32_e32 v11, v12, v11
+; CI-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v10, v11, v10
+; CI-NEXT:    v_and_b32_e32 v11, 0x80000000, v2
+; CI-NEXT:    v_xor_b32_e32 v10, v11, v10
+; CI-NEXT:  .LBB12_24:
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v3|, |v7|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB12_26
+; CI-NEXT:  ; %bb.25: ; %frem.else84
+; CI-NEXT:    s_brev_b32 s0, -2
+; CI-NEXT:    v_bfi_b32 v11, s0, 0, v3
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v3|, |v7|
+; CI-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc
+; CI-NEXT:    s_cbranch_execz .LBB12_27
+; CI-NEXT:    s_branch .LBB12_32
+; CI-NEXT:  .LBB12_26:
+; CI-NEXT:    ; implicit-def: $vgpr11
+; CI-NEXT:  .LBB12_27: ; %frem.compute83
+; CI-NEXT:    v_frexp_mant_f32_e64 v12, |v7|
+; CI-NEXT:    v_ldexp_f32_e64 v12, v12, 1
+; CI-NEXT:    v_div_scale_f32 v18, s[0:1], v12, v12, 1.0
+; CI-NEXT:    v_frexp_mant_f32_e64 v11, |v3|
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v17, v7
+; CI-NEXT:    v_ldexp_f32_e64 v14, v11, 12
+; CI-NEXT:    v_add_i32_e32 v11, vcc, -1, v17
+; CI-NEXT:    v_frexp_exp_i32_f32_e32 v16, v3
+; CI-NEXT:    v_not_b32_e32 v13, v11
+; CI-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CI-NEXT:    v_div_scale_f32 v15, vcc, 1.0, v12, 1.0
+; CI-NEXT:    v_rcp_f32_e32 v19, v18
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
-; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
-; CI-NEXT:    v_mul_f32_e32 v8, v5, v7
-; CI-NEXT:    v_fma_f32 v9, -v6, v8, v5
-; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
-; CI-NEXT:    v_fma_f32 v5, -v6, v8, v5
+; CI-NEXT:    v_fma_f32 v20, -v18, v19, 1.0
+; CI-NEXT:    v_fma_f32 v19, v20, v19, v19
+; CI-NEXT:    v_mul_f32_e32 v20, v15, v19
+; CI-NEXT:    v_fma_f32 v21, -v18, v20, v15
+; CI-NEXT:    v_fma_f32 v20, v21, v19, v20
+; CI-NEXT:    v_fma_f32 v15, -v18, v20, v15
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
-; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
-; CI-NEXT:    v_trunc_f32_e32 v5, v5
-; CI-NEXT:    v_fma_f32 v0, -v5, v4, v0
-; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    v_div_fmas_f32 v15, v15, v19, v20
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v13
+; CI-NEXT:    v_div_fixup_f32 v15, v15, v12, 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB12_31
+; CI-NEXT:  ; %bb.28: ; %frem.loop_body91.preheader
+; CI-NEXT:    v_sub_i32_e32 v13, vcc, v16, v17
+; CI-NEXT:    v_add_i32_e32 v13, vcc, 12, v13
+; CI-NEXT:  .LBB12_29: ; %frem.loop_body91
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v16, v14
+; CI-NEXT:    v_mul_f32_e32 v14, v16, v15
+; CI-NEXT:    v_rndne_f32_e32 v14, v14
+; CI-NEXT:    v_fma_f32 v14, -v14, v12, v16
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v14
+; CI-NEXT:    v_add_f32_e32 v17, v14, v12
+; CI-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
+; CI-NEXT:    v_add_i32_e32 v13, vcc, -12, v13
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v13
+; CI-NEXT:    v_ldexp_f32_e64 v14, v14, 12
+; CI-NEXT:    s_cbranch_vccnz .LBB12_29
+; CI-NEXT:  ; %bb.30: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v14, v16
+; CI-NEXT:  .LBB12_31: ; %frem.loop_exit92
+; CI-NEXT:    v_add_i32_e32 v13, vcc, -11, v13
+; CI-NEXT:    v_ldexp_f32_e32 v13, v14, v13
+; CI-NEXT:    v_mul_f32_e32 v14, v13, v15
+; CI-NEXT:    v_rndne_f32_e32 v14, v14
+; CI-NEXT:    v_fma_f32 v13, -v14, v12, v13
+; CI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v13
+; CI-NEXT:    v_add_f32_e32 v12, v13, v12
+; CI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
+; CI-NEXT:    v_ldexp_f32_e32 v11, v12, v11
+; CI-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; CI-NEXT:    v_xor_b32_e32 v11, v12, v11
+; CI-NEXT:  .LBB12_32: ; %Flow125
+; CI-NEXT:    v_mov_b32_e32 v12, 0x3fc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v4
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], v4, v12
+; CI-NEXT:    v_mov_b32_e32 v4, 0x1f8
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v4
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_mov_b32_e32 v13, 0x7fc00000
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], v5, v12
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], v1, v4
+; CI-NEXT:    v_cndmask_b32_e32 v0, v13, v8, vcc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v5
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], v6, v12
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], v2, v4
+; CI-NEXT:    v_cndmask_b32_e32 v1, v13, v9, vcc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v6
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cmp_class_f32_e64 s[0:1], v7, v12
+; CI-NEXT:    v_cmp_class_f32_e64 s[2:3], v3, v4
+; CI-NEXT:    v_cndmask_b32_e32 v2, v13, v10, vcc
+; CI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v7
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    v_cndmask_b32_e32 v3, v13, v11, vcc
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v8, s0
-; VI-NEXT:    s_add_u32 s0, s4, 64
-; VI-NEXT:    v_mov_b32_e32 v9, s1
-; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_add_u32 s0, s0, 64
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s11
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_div_scale_f32 v11, s[0:1], v7, v7, v3
-; VI-NEXT:    v_div_scale_f32 v10, vcc, v3, v7, v3
-; VI-NEXT:    v_rcp_f32_e32 v12, v11
+; VI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v4|
+; VI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; VI-NEXT:    s_cbranch_vccz .LBB12_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    s_brev_b32 s0, -2
+; VI-NEXT:    v_bfi_b32 v8, s0, 0, v0
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v4|
+; VI-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
+; VI-NEXT:    s_cbranch_execz .LBB12_3
+; VI-NEXT:    s_branch .LBB12_8
+; VI-NEXT:  .LBB12_2:
+; VI-NEXT:    ; implicit-def: $vgpr8
+; VI-NEXT:  .LBB12_3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f32_e64 v9, |v4|
+; VI-NEXT:    v_ldexp_f32 v9, v9, 1
+; VI-NEXT:    v_div_scale_f32 v15, s[0:1], v9, v9, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v8, |v0|
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v14, v4
+; VI-NEXT:    v_ldexp_f32 v11, v8, 12
+; VI-NEXT:    v_add_u32_e32 v8, vcc, -1, v14
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v13, v0
+; VI-NEXT:    v_not_b32_e32 v10, v8
+; VI-NEXT:    v_add_u32_e32 v10, vcc, v10, v13
+; VI-NEXT:    v_div_scale_f32 v12, vcc, 1.0, v9, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v16, v15
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
-; VI-NEXT:    v_fma_f32 v12, v13, v12, v12
-; VI-NEXT:    v_mul_f32_e32 v13, v10, v12
-; VI-NEXT:    v_fma_f32 v14, -v11, v13, v10
-; VI-NEXT:    v_fma_f32 v13, v14, v12, v13
-; VI-NEXT:    v_fma_f32 v10, -v11, v13, v10
+; VI-NEXT:    v_fma_f32 v17, -v15, v16, 1.0
+; VI-NEXT:    v_fma_f32 v16, v17, v16, v16
+; VI-NEXT:    v_mul_f32_e32 v17, v12, v16
+; VI-NEXT:    v_fma_f32 v18, -v15, v17, v12
+; VI-NEXT:    v_fma_f32 v17, v18, v16, v17
+; VI-NEXT:    v_fma_f32 v12, -v15, v17, v12
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v10, v10, v12, v13
-; VI-NEXT:    v_div_fixup_f32 v10, v10, v7, v3
-; VI-NEXT:    v_trunc_f32_e32 v10, v10
-; VI-NEXT:    v_fma_f32 v3, -v10, v7, v3
-; VI-NEXT:    v_div_scale_f32 v10, s[0:1], v6, v6, v2
-; VI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
-; VI-NEXT:    v_rcp_f32_e32 v11, v10
+; VI-NEXT:    v_div_fmas_f32 v12, v12, v16, v17
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v10
+; VI-NEXT:    v_div_fixup_f32 v12, v12, v9, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB12_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v10, vcc, v13, v14
+; VI-NEXT:    v_add_u32_e32 v10, vcc, 12, v10
+; VI-NEXT:  .LBB12_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v13, v11
+; VI-NEXT:    v_mul_f32_e32 v11, v13, v12
+; VI-NEXT:    v_rndne_f32_e32 v11, v11
+; VI-NEXT:    v_fma_f32 v11, -v11, v9, v13
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; VI-NEXT:    v_add_f32_e32 v14, v11, v9
+; VI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; VI-NEXT:    v_add_u32_e32 v10, vcc, -12, v10
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v10
+; VI-NEXT:    v_ldexp_f32 v11, v11, 12
+; VI-NEXT:    s_cbranch_vccnz .LBB12_5
+; VI-NEXT:  ; %bb.6: ; %Flow134
+; VI-NEXT:    v_mov_b32_e32 v11, v13
+; VI-NEXT:  .LBB12_7: ; %frem.loop_exit
+; VI-NEXT:    v_add_u32_e32 v10, vcc, -11, v10
+; VI-NEXT:    v_ldexp_f32 v10, v11, v10
+; VI-NEXT:    v_mul_f32_e32 v11, v10, v12
+; VI-NEXT:    v_rndne_f32_e32 v11, v11
+; VI-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; VI-NEXT:    v_add_f32_e32 v9, v10, v9
+; VI-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; VI-NEXT:    v_ldexp_f32 v8, v9, v8
+; VI-NEXT:    v_and_b32_e32 v9, 0x80000000, v0
+; VI-NEXT:    v_xor_b32_e32 v8, v9, v8
+; VI-NEXT:  .LBB12_8:
+; VI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v1|, |v5|
+; VI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; VI-NEXT:    s_cbranch_vccz .LBB12_10
+; VI-NEXT:  ; %bb.9: ; %frem.else16
+; VI-NEXT:    s_brev_b32 s0, -2
+; VI-NEXT:    v_bfi_b32 v9, s0, 0, v1
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v5|
+; VI-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc
+; VI-NEXT:    s_cbranch_execz .LBB12_11
+; VI-NEXT:    s_branch .LBB12_16
+; VI-NEXT:  .LBB12_10:
+; VI-NEXT:    ; implicit-def: $vgpr9
+; VI-NEXT:  .LBB12_11: ; %frem.compute15
+; VI-NEXT:    v_frexp_mant_f32_e64 v10, |v5|
+; VI-NEXT:    v_ldexp_f32 v10, v10, 1
+; VI-NEXT:    v_div_scale_f32 v16, s[0:1], v10, v10, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v9, |v1|
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v15, v5
+; VI-NEXT:    v_ldexp_f32 v12, v9, 12
+; VI-NEXT:    v_add_u32_e32 v9, vcc, -1, v15
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v14, v1
+; VI-NEXT:    v_not_b32_e32 v11, v9
+; VI-NEXT:    v_add_u32_e32 v11, vcc, v11, v14
+; VI-NEXT:    v_div_scale_f32 v13, vcc, 1.0, v10, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v17, v16
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
-; VI-NEXT:    v_fma_f32 v11, v12, v11, v11
-; VI-NEXT:    v_mul_f32_e32 v12, v7, v11
-; VI-NEXT:    v_fma_f32 v13, -v10, v12, v7
-; VI-NEXT:    v_fma_f32 v12, v13, v11, v12
-; VI-NEXT:    v_fma_f32 v7, -v10, v12, v7
+; VI-NEXT:    v_fma_f32 v18, -v16, v17, 1.0
+; VI-NEXT:    v_fma_f32 v17, v18, v17, v17
+; VI-NEXT:    v_mul_f32_e32 v18, v13, v17
+; VI-NEXT:    v_fma_f32 v19, -v16, v18, v13
+; VI-NEXT:    v_fma_f32 v18, v19, v17, v18
+; VI-NEXT:    v_fma_f32 v13, -v16, v18, v13
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v7, v7, v11, v12
-; VI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
-; VI-NEXT:    v_trunc_f32_e32 v7, v7
-; VI-NEXT:    v_fma_f32 v2, -v7, v6, v2
-; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
-; VI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
-; VI-NEXT:    v_rcp_f32_e32 v10, v7
+; VI-NEXT:    v_div_fmas_f32 v13, v13, v17, v18
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v11
+; VI-NEXT:    v_div_fixup_f32 v13, v13, v10, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB12_15
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT:    v_sub_u32_e32 v11, vcc, v14, v15
+; VI-NEXT:    v_add_u32_e32 v11, vcc, 12, v11
+; VI-NEXT:  .LBB12_13: ; %frem.loop_body23
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v14, v12
+; VI-NEXT:    v_mul_f32_e32 v12, v14, v13
+; VI-NEXT:    v_rndne_f32_e32 v12, v12
+; VI-NEXT:    v_fma_f32 v12, -v12, v10, v14
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; VI-NEXT:    v_add_f32_e32 v15, v12, v10
+; VI-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
+; VI-NEXT:    v_add_u32_e32 v11, vcc, -12, v11
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v11
+; VI-NEXT:    v_ldexp_f32 v12, v12, 12
+; VI-NEXT:    s_cbranch_vccnz .LBB12_13
+; VI-NEXT:  ; %bb.14: ; %Flow130
+; VI-NEXT:    v_mov_b32_e32 v12, v14
+; VI-NEXT:  .LBB12_15: ; %frem.loop_exit24
+; VI-NEXT:    v_add_u32_e32 v11, vcc, -11, v11
+; VI-NEXT:    v_ldexp_f32 v11, v12, v11
+; VI-NEXT:    v_mul_f32_e32 v12, v11, v13
+; VI-NEXT:    v_rndne_f32_e32 v12, v12
+; VI-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; VI-NEXT:    v_add_f32_e32 v10, v11, v10
+; VI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; VI-NEXT:    v_ldexp_f32 v9, v10, v9
+; VI-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; VI-NEXT:    v_xor_b32_e32 v9, v10, v9
+; VI-NEXT:  .LBB12_16:
+; VI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v2|, |v6|
+; VI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; VI-NEXT:    s_cbranch_vccz .LBB12_18
+; VI-NEXT:  ; %bb.17: ; %frem.else50
+; VI-NEXT:    s_brev_b32 s0, -2
+; VI-NEXT:    v_bfi_b32 v10, s0, 0, v2
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v2|, |v6|
+; VI-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
+; VI-NEXT:    s_cbranch_execz .LBB12_19
+; VI-NEXT:    s_branch .LBB12_24
+; VI-NEXT:  .LBB12_18:
+; VI-NEXT:    ; implicit-def: $vgpr10
+; VI-NEXT:  .LBB12_19: ; %frem.compute49
+; VI-NEXT:    v_frexp_mant_f32_e64 v11, |v6|
+; VI-NEXT:    v_ldexp_f32 v11, v11, 1
+; VI-NEXT:    v_div_scale_f32 v17, s[0:1], v11, v11, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v10, |v2|
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v16, v6
+; VI-NEXT:    v_ldexp_f32 v13, v10, 12
+; VI-NEXT:    v_add_u32_e32 v10, vcc, -1, v16
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v15, v2
+; VI-NEXT:    v_not_b32_e32 v12, v10
+; VI-NEXT:    v_add_u32_e32 v12, vcc, v12, v15
+; VI-NEXT:    v_div_scale_f32 v14, vcc, 1.0, v11, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v18, v17
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v11, -v7, v10, 1.0
-; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
-; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
-; VI-NEXT:    v_fma_f32 v12, -v7, v11, v6
-; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
-; VI-NEXT:    v_fma_f32 v6, -v7, v11, v6
+; VI-NEXT:    v_fma_f32 v19, -v17, v18, 1.0
+; VI-NEXT:    v_fma_f32 v18, v19, v18, v18
+; VI-NEXT:    v_mul_f32_e32 v19, v14, v18
+; VI-NEXT:    v_fma_f32 v20, -v17, v19, v14
+; VI-NEXT:    v_fma_f32 v19, v20, v18, v19
+; VI-NEXT:    v_fma_f32 v14, -v17, v19, v14
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
-; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
-; VI-NEXT:    v_trunc_f32_e32 v6, v6
-; VI-NEXT:    v_fma_f32 v1, -v6, v5, v1
-; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
-; VI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
-; VI-NEXT:    v_rcp_f32_e32 v7, v6
+; VI-NEXT:    v_div_fmas_f32 v14, v14, v18, v19
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v12
+; VI-NEXT:    v_div_fixup_f32 v14, v14, v11, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB12_23
+; VI-NEXT:  ; %bb.20: ; %frem.loop_body57.preheader
+; VI-NEXT:    v_sub_u32_e32 v12, vcc, v15, v16
+; VI-NEXT:    v_add_u32_e32 v12, vcc, 12, v12
+; VI-NEXT:  .LBB12_21: ; %frem.loop_body57
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v15, v13
+; VI-NEXT:    v_mul_f32_e32 v13, v15, v14
+; VI-NEXT:    v_rndne_f32_e32 v13, v13
+; VI-NEXT:    v_fma_f32 v13, -v13, v11, v15
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v13
+; VI-NEXT:    v_add_f32_e32 v16, v13, v11
+; VI-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc
+; VI-NEXT:    v_add_u32_e32 v12, vcc, -12, v12
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v12
+; VI-NEXT:    v_ldexp_f32 v13, v13, 12
+; VI-NEXT:    s_cbranch_vccnz .LBB12_21
+; VI-NEXT:  ; %bb.22: ; %Flow126
+; VI-NEXT:    v_mov_b32_e32 v13, v15
+; VI-NEXT:  .LBB12_23: ; %frem.loop_exit58
+; VI-NEXT:    v_add_u32_e32 v12, vcc, -11, v12
+; VI-NEXT:    v_ldexp_f32 v12, v13, v12
+; VI-NEXT:    v_mul_f32_e32 v13, v12, v14
+; VI-NEXT:    v_rndne_f32_e32 v13, v13
+; VI-NEXT:    v_fma_f32 v12, -v13, v11, v12
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; VI-NEXT:    v_add_f32_e32 v11, v12, v11
+; VI-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; VI-NEXT:    v_ldexp_f32 v10, v11, v10
+; VI-NEXT:    v_and_b32_e32 v11, 0x80000000, v2
+; VI-NEXT:    v_xor_b32_e32 v10, v11, v10
+; VI-NEXT:  .LBB12_24:
+; VI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v3|, |v7|
+; VI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; VI-NEXT:    s_cbranch_vccz .LBB12_26
+; VI-NEXT:  ; %bb.25: ; %frem.else84
+; VI-NEXT:    s_brev_b32 s0, -2
+; VI-NEXT:    v_bfi_b32 v11, s0, 0, v3
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v3|, |v7|
+; VI-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc
+; VI-NEXT:    s_cbranch_execz .LBB12_27
+; VI-NEXT:    s_branch .LBB12_32
+; VI-NEXT:  .LBB12_26:
+; VI-NEXT:    ; implicit-def: $vgpr11
+; VI-NEXT:  .LBB12_27: ; %frem.compute83
+; VI-NEXT:    v_frexp_mant_f32_e64 v12, |v7|
+; VI-NEXT:    v_ldexp_f32 v12, v12, 1
+; VI-NEXT:    v_div_scale_f32 v18, s[0:1], v12, v12, 1.0
+; VI-NEXT:    v_frexp_mant_f32_e64 v11, |v3|
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v17, v7
+; VI-NEXT:    v_ldexp_f32 v14, v11, 12
+; VI-NEXT:    v_add_u32_e32 v11, vcc, -1, v17
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v16, v3
+; VI-NEXT:    v_not_b32_e32 v13, v11
+; VI-NEXT:    v_add_u32_e32 v13, vcc, v13, v16
+; VI-NEXT:    v_div_scale_f32 v15, vcc, 1.0, v12, 1.0
+; VI-NEXT:    v_rcp_f32_e32 v19, v18
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT:    v_fma_f32 v10, -v6, v7, 1.0
-; VI-NEXT:    v_fma_f32 v7, v10, v7, v7
-; VI-NEXT:    v_mul_f32_e32 v10, v5, v7
-; VI-NEXT:    v_fma_f32 v11, -v6, v10, v5
-; VI-NEXT:    v_fma_f32 v10, v11, v7, v10
-; VI-NEXT:    v_fma_f32 v5, -v6, v10, v5
+; VI-NEXT:    v_fma_f32 v20, -v18, v19, 1.0
+; VI-NEXT:    v_fma_f32 v19, v20, v19, v19
+; VI-NEXT:    v_mul_f32_e32 v20, v15, v19
+; VI-NEXT:    v_fma_f32 v21, -v18, v20, v15
+; VI-NEXT:    v_fma_f32 v20, v21, v19, v20
+; VI-NEXT:    v_fma_f32 v15, -v18, v20, v15
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v10
-; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
-; VI-NEXT:    v_trunc_f32_e32 v5, v5
-; VI-NEXT:    v_fma_f32 v0, -v5, v4, v0
-; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; VI-NEXT:    v_div_fmas_f32 v15, v15, v19, v20
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v13
+; VI-NEXT:    v_div_fixup_f32 v15, v15, v12, 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB12_31
+; VI-NEXT:  ; %bb.28: ; %frem.loop_body91.preheader
+; VI-NEXT:    v_sub_u32_e32 v13, vcc, v16, v17
+; VI-NEXT:    v_add_u32_e32 v13, vcc, 12, v13
+; VI-NEXT:  .LBB12_29: ; %frem.loop_body91
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v16, v14
+; VI-NEXT:    v_mul_f32_e32 v14, v16, v15
+; VI-NEXT:    v_rndne_f32_e32 v14, v14
+; VI-NEXT:    v_fma_f32 v14, -v14, v12, v16
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v14
+; VI-NEXT:    v_add_f32_e32 v17, v14, v12
+; VI-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
+; VI-NEXT:    v_add_u32_e32 v13, vcc, -12, v13
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v13
+; VI-NEXT:    v_ldexp_f32 v14, v14, 12
+; VI-NEXT:    s_cbranch_vccnz .LBB12_29
+; VI-NEXT:  ; %bb.30: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v14, v16
+; VI-NEXT:  .LBB12_31: ; %frem.loop_exit92
+; VI-NEXT:    v_add_u32_e32 v13, vcc, -11, v13
+; VI-NEXT:    v_ldexp_f32 v13, v14, v13
+; VI-NEXT:    v_mul_f32_e32 v14, v13, v15
+; VI-NEXT:    v_rndne_f32_e32 v14, v14
+; VI-NEXT:    v_fma_f32 v13, -v14, v12, v13
+; VI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v13
+; VI-NEXT:    v_add_f32_e32 v12, v13, v12
+; VI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
+; VI-NEXT:    v_ldexp_f32 v11, v12, v11
+; VI-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; VI-NEXT:    v_xor_b32_e32 v11, v12, v11
+; VI-NEXT:  .LBB12_32: ; %Flow125
+; VI-NEXT:    v_mov_b32_e32 v12, 0x3fc
+; VI-NEXT:    v_mov_b32_e32 v13, 0x1f8
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], v4, v12
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v13
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v4
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v14, 0x7fc00000
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], v5, v12
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], v1, v13
+; VI-NEXT:    v_cndmask_b32_e32 v0, v14, v8, vcc
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v5
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], v6, v12
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], v2, v13
+; VI-NEXT:    v_cndmask_b32_e32 v1, v14, v9, vcc
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v6
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cmp_class_f32_e64 s[0:1], v7, v12
+; VI-NEXT:    v_cmp_class_f32_e64 s[2:3], v3, v13
+; VI-NEXT:    v_cndmask_b32_e32 v2, v14, v10, vcc
+; VI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v7
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s8
+; VI-NEXT:    v_mov_b32_e32 v5, s9
+; VI-NEXT:    v_cndmask_b32_e32 v3, v14, v11, vcc
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: frem_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[10:11]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:64
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_div_scale_f32 v10, s[2:3], v7, v7, v3
-; GFX9-NEXT:    v_div_scale_f32 v9, vcc, v3, v7, v3
-; GFX9-NEXT:    v_rcp_f32_e32 v11, v10
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v4|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB12_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    s_brev_b32 s0, -2
+; GFX9-NEXT:    v_bfi_b32 v8, s0, 0, v0
+; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB12_3
+; GFX9-NEXT:    s_branch .LBB12_8
+; GFX9-NEXT:  .LBB12_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr8
+; GFX9-NEXT:  .LBB12_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v9, |v4|
+; GFX9-NEXT:    v_ldexp_f32 v9, v9, 1
+; GFX9-NEXT:    v_div_scale_f32 v15, s[0:1], v9, v9, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v12, vcc, 1.0, v9, 1.0
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v8, |v0|
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v13, v0
+; GFX9-NEXT:    v_ldexp_f32 v11, v8, 12
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v14, v4
+; GFX9-NEXT:    v_add_u32_e32 v8, -1, v14
+; GFX9-NEXT:    v_not_b32_e32 v10, v8
+; GFX9-NEXT:    v_add_u32_e32 v10, v10, v13
+; GFX9-NEXT:    v_rcp_f32_e32 v16, v15
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX9-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
-; GFX9-NEXT:    v_fma_f32 v11, v12, v11, v11
-; GFX9-NEXT:    v_mul_f32_e32 v12, v9, v11
-; GFX9-NEXT:    v_fma_f32 v13, -v10, v12, v9
-; GFX9-NEXT:    v_fma_f32 v12, v13, v11, v12
-; GFX9-NEXT:    v_fma_f32 v9, -v10, v12, v9
+; GFX9-NEXT:    v_fma_f32 v17, -v15, v16, 1.0
+; GFX9-NEXT:    v_fma_f32 v16, v17, v16, v16
+; GFX9-NEXT:    v_mul_f32_e32 v17, v12, v16
+; GFX9-NEXT:    v_fma_f32 v18, -v15, v17, v12
+; GFX9-NEXT:    v_fma_f32 v17, v18, v16, v17
+; GFX9-NEXT:    v_fma_f32 v12, -v15, v17, v12
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX9-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
-; GFX9-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
-; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
-; GFX9-NEXT:    v_fma_f32 v3, -v9, v7, v3
-; GFX9-NEXT:    v_div_scale_f32 v9, s[2:3], v6, v6, v2
-; GFX9-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
-; GFX9-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX9-NEXT:    v_div_fmas_f32 v12, v12, v16, v17
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v10
+; GFX9-NEXT:    v_div_fixup_f32 v12, v12, v9, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB12_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v10, v13, v14
+; GFX9-NEXT:    v_add_u32_e32 v10, 12, v10
+; GFX9-NEXT:  .LBB12_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-NEXT:    v_mul_f32_e32 v11, v13, v12
+; GFX9-NEXT:    v_rndne_f32_e32 v11, v11
+; GFX9-NEXT:    v_fma_f32 v11, -v11, v9, v13
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; GFX9-NEXT:    v_add_f32_e32 v14, v11, v9
+; GFX9-NEXT:    v_add_u32_e32 v10, -12, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v10
+; GFX9-NEXT:    v_ldexp_f32 v11, v11, 12
+; GFX9-NEXT:    s_cbranch_vccnz .LBB12_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow134
+; GFX9-NEXT:    v_mov_b32_e32 v11, v13
+; GFX9-NEXT:  .LBB12_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_add_u32_e32 v10, -11, v10
+; GFX9-NEXT:    v_ldexp_f32 v10, v11, v10
+; GFX9-NEXT:    v_mul_f32_e32 v11, v10, v12
+; GFX9-NEXT:    v_rndne_f32_e32 v11, v11
+; GFX9-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v10
+; GFX9-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT:    v_ldexp_f32 v8, v9, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0x80000000, v0
+; GFX9-NEXT:    v_xor_b32_e32 v8, v9, v8
+; GFX9-NEXT:  .LBB12_8:
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v1|, |v5|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB12_10
+; GFX9-NEXT:  ; %bb.9: ; %frem.else16
+; GFX9-NEXT:    s_brev_b32 s0, -2
+; GFX9-NEXT:    v_bfi_b32 v9, s0, 0, v1
+; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB12_11
+; GFX9-NEXT:    s_branch .LBB12_16
+; GFX9-NEXT:  .LBB12_10:
+; GFX9-NEXT:    ; implicit-def: $vgpr9
+; GFX9-NEXT:  .LBB12_11: ; %frem.compute15
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v10, |v5|
+; GFX9-NEXT:    v_ldexp_f32 v10, v10, 1
+; GFX9-NEXT:    v_div_scale_f32 v16, s[0:1], v10, v10, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v13, vcc, 1.0, v10, 1.0
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v9, |v1|
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v14, v1
+; GFX9-NEXT:    v_ldexp_f32 v12, v9, 12
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v15, v5
+; GFX9-NEXT:    v_add_u32_e32 v9, -1, v15
+; GFX9-NEXT:    v_not_b32_e32 v11, v9
+; GFX9-NEXT:    v_add_u32_e32 v11, v11, v14
+; GFX9-NEXT:    v_rcp_f32_e32 v17, v16
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX9-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
-; GFX9-NEXT:    v_fma_f32 v10, v11, v10, v10
-; GFX9-NEXT:    v_mul_f32_e32 v11, v7, v10
-; GFX9-NEXT:    v_fma_f32 v12, -v9, v11, v7
-; GFX9-NEXT:    v_fma_f32 v11, v12, v10, v11
-; GFX9-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX9-NEXT:    v_fma_f32 v18, -v16, v17, 1.0
+; GFX9-NEXT:    v_fma_f32 v17, v18, v17, v17
+; GFX9-NEXT:    v_mul_f32_e32 v18, v13, v17
+; GFX9-NEXT:    v_fma_f32 v19, -v16, v18, v13
+; GFX9-NEXT:    v_fma_f32 v18, v19, v17, v18
+; GFX9-NEXT:    v_fma_f32 v13, -v16, v18, v13
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX9-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
-; GFX9-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
-; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX9-NEXT:    v_fma_f32 v2, -v7, v6, v2
-; GFX9-NEXT:    v_div_scale_f32 v7, s[2:3], v5, v5, v1
-; GFX9-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v9, v7
+; GFX9-NEXT:    v_div_fmas_f32 v13, v13, v17, v18
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v11
+; GFX9-NEXT:    v_div_fixup_f32 v13, v13, v10, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB12_15
+; GFX9-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v11, v14, v15
+; GFX9-NEXT:    v_add_u32_e32 v11, 12, v11
+; GFX9-NEXT:  .LBB12_13: ; %frem.loop_body23
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-NEXT:    v_mul_f32_e32 v12, v14, v13
+; GFX9-NEXT:    v_rndne_f32_e32 v12, v12
+; GFX9-NEXT:    v_fma_f32 v12, -v12, v10, v14
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; GFX9-NEXT:    v_add_f32_e32 v15, v12, v10
+; GFX9-NEXT:    v_add_u32_e32 v11, -12, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v11
+; GFX9-NEXT:    v_ldexp_f32 v12, v12, 12
+; GFX9-NEXT:    s_cbranch_vccnz .LBB12_13
+; GFX9-NEXT:  ; %bb.14: ; %Flow130
+; GFX9-NEXT:    v_mov_b32_e32 v12, v14
+; GFX9-NEXT:  .LBB12_15: ; %frem.loop_exit24
+; GFX9-NEXT:    v_add_u32_e32 v11, -11, v11
+; GFX9-NEXT:    v_ldexp_f32 v11, v12, v11
+; GFX9-NEXT:    v_mul_f32_e32 v12, v11, v13
+; GFX9-NEXT:    v_rndne_f32_e32 v12, v12
+; GFX9-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v11
+; GFX9-NEXT:    v_add_f32_e32 v10, v11, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX9-NEXT:    v_ldexp_f32 v9, v10, v9
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; GFX9-NEXT:    v_xor_b32_e32 v9, v10, v9
+; GFX9-NEXT:  .LBB12_16:
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v2|, |v6|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB12_18
+; GFX9-NEXT:  ; %bb.17: ; %frem.else50
+; GFX9-NEXT:    s_brev_b32 s0, -2
+; GFX9-NEXT:    v_bfi_b32 v10, s0, 0, v2
+; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v2|, |v6|
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB12_19
+; GFX9-NEXT:    s_branch .LBB12_24
+; GFX9-NEXT:  .LBB12_18:
+; GFX9-NEXT:    ; implicit-def: $vgpr10
+; GFX9-NEXT:  .LBB12_19: ; %frem.compute49
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v11, |v6|
+; GFX9-NEXT:    v_ldexp_f32 v11, v11, 1
+; GFX9-NEXT:    v_div_scale_f32 v17, s[0:1], v11, v11, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v14, vcc, 1.0, v11, 1.0
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v10, |v2|
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v15, v2
+; GFX9-NEXT:    v_ldexp_f32 v13, v10, 12
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v16, v6
+; GFX9-NEXT:    v_add_u32_e32 v10, -1, v16
+; GFX9-NEXT:    v_not_b32_e32 v12, v10
+; GFX9-NEXT:    v_add_u32_e32 v12, v12, v15
+; GFX9-NEXT:    v_rcp_f32_e32 v18, v17
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX9-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
-; GFX9-NEXT:    v_fma_f32 v9, v10, v9, v9
-; GFX9-NEXT:    v_mul_f32_e32 v10, v6, v9
-; GFX9-NEXT:    v_fma_f32 v11, -v7, v10, v6
-; GFX9-NEXT:    v_fma_f32 v10, v11, v9, v10
-; GFX9-NEXT:    v_fma_f32 v6, -v7, v10, v6
+; GFX9-NEXT:    v_fma_f32 v19, -v17, v18, 1.0
+; GFX9-NEXT:    v_fma_f32 v18, v19, v18, v18
+; GFX9-NEXT:    v_mul_f32_e32 v19, v14, v18
+; GFX9-NEXT:    v_fma_f32 v20, -v17, v19, v14
+; GFX9-NEXT:    v_fma_f32 v19, v20, v18, v19
+; GFX9-NEXT:    v_fma_f32 v14, -v17, v19, v14
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX9-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
-; GFX9-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX9-NEXT:    v_fma_f32 v1, -v6, v5, v1
-; GFX9-NEXT:    v_div_scale_f32 v6, s[2:3], v4, v4, v0
-; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
-; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX9-NEXT:    v_div_fmas_f32 v14, v14, v18, v19
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v12
+; GFX9-NEXT:    v_div_fixup_f32 v14, v14, v11, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB12_23
+; GFX9-NEXT:  ; %bb.20: ; %frem.loop_body57.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v12, v15, v16
+; GFX9-NEXT:    v_add_u32_e32 v12, 12, v12
+; GFX9-NEXT:  .LBB12_21: ; %frem.loop_body57
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v15, v13
+; GFX9-NEXT:    v_mul_f32_e32 v13, v15, v14
+; GFX9-NEXT:    v_rndne_f32_e32 v13, v13
+; GFX9-NEXT:    v_fma_f32 v13, -v13, v11, v15
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v13
+; GFX9-NEXT:    v_add_f32_e32 v16, v13, v11
+; GFX9-NEXT:    v_add_u32_e32 v12, -12, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v12
+; GFX9-NEXT:    v_ldexp_f32 v13, v13, 12
+; GFX9-NEXT:    s_cbranch_vccnz .LBB12_21
+; GFX9-NEXT:  ; %bb.22: ; %Flow126
+; GFX9-NEXT:    v_mov_b32_e32 v13, v15
+; GFX9-NEXT:  .LBB12_23: ; %frem.loop_exit58
+; GFX9-NEXT:    v_add_u32_e32 v12, -11, v12
+; GFX9-NEXT:    v_ldexp_f32 v12, v13, v12
+; GFX9-NEXT:    v_mul_f32_e32 v13, v12, v14
+; GFX9-NEXT:    v_rndne_f32_e32 v13, v13
+; GFX9-NEXT:    v_fma_f32 v12, -v13, v11, v12
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v12
+; GFX9-NEXT:    v_add_f32_e32 v11, v12, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX9-NEXT:    v_ldexp_f32 v10, v11, v10
+; GFX9-NEXT:    v_and_b32_e32 v11, 0x80000000, v2
+; GFX9-NEXT:    v_xor_b32_e32 v10, v11, v10
+; GFX9-NEXT:  .LBB12_24:
+; GFX9-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v3|, |v7|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB12_26
+; GFX9-NEXT:  ; %bb.25: ; %frem.else84
+; GFX9-NEXT:    s_brev_b32 s0, -2
+; GFX9-NEXT:    v_bfi_b32 v11, s0, 0, v3
+; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v3|, |v7|
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB12_27
+; GFX9-NEXT:    s_branch .LBB12_32
+; GFX9-NEXT:  .LBB12_26:
+; GFX9-NEXT:    ; implicit-def: $vgpr11
+; GFX9-NEXT:  .LBB12_27: ; %frem.compute83
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v12, |v7|
+; GFX9-NEXT:    v_ldexp_f32 v12, v12, 1
+; GFX9-NEXT:    v_div_scale_f32 v18, s[0:1], v12, v12, 1.0
+; GFX9-NEXT:    v_div_scale_f32 v15, vcc, 1.0, v12, 1.0
+; GFX9-NEXT:    v_frexp_mant_f32_e64 v11, |v3|
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v16, v3
+; GFX9-NEXT:    v_ldexp_f32 v14, v11, 12
+; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v17, v7
+; GFX9-NEXT:    v_add_u32_e32 v11, -1, v17
+; GFX9-NEXT:    v_not_b32_e32 v13, v11
+; GFX9-NEXT:    v_add_u32_e32 v13, v13, v16
+; GFX9-NEXT:    v_rcp_f32_e32 v19, v18
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX9-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
-; GFX9-NEXT:    v_fma_f32 v7, v9, v7, v7
-; GFX9-NEXT:    v_mul_f32_e32 v9, v5, v7
-; GFX9-NEXT:    v_fma_f32 v10, -v6, v9, v5
-; GFX9-NEXT:    v_fma_f32 v9, v10, v7, v9
-; GFX9-NEXT:    v_fma_f32 v5, -v6, v9, v5
+; GFX9-NEXT:    v_fma_f32 v20, -v18, v19, 1.0
+; GFX9-NEXT:    v_fma_f32 v19, v20, v19, v19
+; GFX9-NEXT:    v_mul_f32_e32 v20, v15, v19
+; GFX9-NEXT:    v_fma_f32 v21, -v18, v20, v15
+; GFX9-NEXT:    v_fma_f32 v20, v21, v19, v20
+; GFX9-NEXT:    v_fma_f32 v15, -v18, v20, v15
 ; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
-; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_fma_f32 v0, -v5, v4, v0
-; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX9-NEXT:    v_div_fmas_f32 v15, v15, v19, v20
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 13, v13
+; GFX9-NEXT:    v_div_fixup_f32 v15, v15, v12, 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB12_31
+; GFX9-NEXT:  ; %bb.28: ; %frem.loop_body91.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v13, v16, v17
+; GFX9-NEXT:    v_add_u32_e32 v13, 12, v13
+; GFX9-NEXT:  .LBB12_29: ; %frem.loop_body91
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v16, v14
+; GFX9-NEXT:    v_mul_f32_e32 v14, v16, v15
+; GFX9-NEXT:    v_rndne_f32_e32 v14, v14
+; GFX9-NEXT:    v_fma_f32 v14, -v14, v12, v16
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v14
+; GFX9-NEXT:    v_add_f32_e32 v17, v14, v12
+; GFX9-NEXT:    v_add_u32_e32 v13, -12, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 12, v13
+; GFX9-NEXT:    v_ldexp_f32 v14, v14, 12
+; GFX9-NEXT:    s_cbranch_vccnz .LBB12_29
+; GFX9-NEXT:  ; %bb.30: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v14, v16
+; GFX9-NEXT:  .LBB12_31: ; %frem.loop_exit92
+; GFX9-NEXT:    v_add_u32_e32 v13, -11, v13
+; GFX9-NEXT:    v_ldexp_f32 v13, v14, v13
+; GFX9-NEXT:    v_mul_f32_e32 v14, v13, v15
+; GFX9-NEXT:    v_rndne_f32_e32 v14, v14
+; GFX9-NEXT:    v_fma_f32 v13, -v14, v12, v13
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v13
+; GFX9-NEXT:    v_add_f32_e32 v12, v13, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
+; GFX9-NEXT:    v_ldexp_f32 v11, v12, v11
+; GFX9-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; GFX9-NEXT:    v_xor_b32_e32 v11, v12, v11
+; GFX9-NEXT:  .LBB12_32: ; %Flow125
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0x3fc
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[0:1], v4, v12
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[2:3], v0, v4
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v13, 0x7fc00000
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[0:1], v5, v12
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[2:3], v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v13, v8, vcc
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[0:1], v6, v12
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[2:3], v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v13, v9, vcc
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[0:1], v7, v12
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[2:3], v3, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v13, v10, vcc
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v7
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v13, v11, vcc
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: frem_v4f32:
@@ -3591,67 +13172,312 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7] offset:64
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f32 v10, s2, v7, v7, v3
-; GFX10-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
-; GFX10-NEXT:    v_rcp_f32_e32 v11, v10
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s2, |v0|, |v4|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB12_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_bfi_b32 v8, 0x7fffffff, 0, v0
+; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4|
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB12_3
+; GFX10-NEXT:    s_branch .LBB12_8
+; GFX10-NEXT:  .LBB12_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr8
+; GFX10-NEXT:  .LBB12_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v9, |v4|
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v8, |v0|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v11, v0
+; GFX10-NEXT:    v_ldexp_f32 v9, v9, 1
+; GFX10-NEXT:    v_ldexp_f32 v10, v8, 12
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v8, v4
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v11
+; GFX10-NEXT:    v_div_scale_f32 v13, s4, v9, v9, 1.0
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, -1, v8
+; GFX10-NEXT:    v_rcp_f32_e32 v14, v13
+; GFX10-NEXT:    v_not_b32_e32 v12, v8
+; GFX10-NEXT:    v_add_nc_u32_e32 v12, v12, v11
+; GFX10-NEXT:    v_div_scale_f32 v11, vcc_lo, 1.0, v9, 1.0
 ; GFX10-NEXT:    s_denorm_mode 15
-; GFX10-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v11
-; GFX10-NEXT:    v_mul_f32_e32 v12, v9, v11
-; GFX10-NEXT:    v_fma_f32 v13, -v10, v12, v9
-; GFX10-NEXT:    v_fmac_f32_e32 v12, v13, v11
-; GFX10-NEXT:    v_fma_f32 v9, -v10, v12, v9
+; GFX10-NEXT:    v_fma_f32 v15, -v13, v14, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v14, v15, v14
+; GFX10-NEXT:    v_mul_f32_e32 v15, v11, v14
+; GFX10-NEXT:    v_fma_f32 v16, -v13, v15, v11
+; GFX10-NEXT:    v_fmac_f32_e32 v15, v16, v14
+; GFX10-NEXT:    v_fma_f32 v11, -v13, v15, v11
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
-; GFX10-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
-; GFX10-NEXT:    v_trunc_f32_e32 v9, v9
-; GFX10-NEXT:    v_fma_f32 v3, -v9, v7, v3
-; GFX10-NEXT:    v_div_scale_f32 v9, s2, v6, v6, v2
-; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX10-NEXT:    v_div_fmas_f32 v11, v11, v14, v15
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v12
+; GFX10-NEXT:    v_div_fixup_f32 v11, v11, v9, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB12_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 12
+; GFX10-NEXT:  .LBB12_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v13, v10
+; GFX10-NEXT:    s_add_i32 s2, s2, -12
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX10-NEXT:    v_mul_f32_e32 v10, v13, v11
+; GFX10-NEXT:    v_rndne_f32_e32 v10, v10
+; GFX10-NEXT:    v_fma_f32 v10, -v10, v9, v13
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_add_f32_e32 v12, v10, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v10, v10, 12
+; GFX10-NEXT:    s_cbranch_scc1 .LBB12_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow134
+; GFX10-NEXT:    v_mov_b32_e32 v12, s2
+; GFX10-NEXT:    v_mov_b32_e32 v10, v13
+; GFX10-NEXT:  .LBB12_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_add_nc_u32_e32 v12, -11, v12
+; GFX10-NEXT:    v_ldexp_f32 v10, v10, v12
+; GFX10-NEXT:    v_mul_f32_e32 v11, v10, v11
+; GFX10-NEXT:    v_rndne_f32_e32 v11, v11
+; GFX10-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v8, v9, v8
+; GFX10-NEXT:    v_and_b32_e32 v9, 0x80000000, v0
+; GFX10-NEXT:    v_xor_b32_e32 v8, v9, v8
+; GFX10-NEXT:  .LBB12_8:
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s2, |v1|, |v5|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB12_10
+; GFX10-NEXT:  ; %bb.9: ; %frem.else16
+; GFX10-NEXT:    v_bfi_b32 v9, 0x7fffffff, 0, v1
+; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5|
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB12_11
+; GFX10-NEXT:    s_branch .LBB12_16
+; GFX10-NEXT:  .LBB12_10:
+; GFX10-NEXT:    ; implicit-def: $vgpr9
+; GFX10-NEXT:  .LBB12_11: ; %frem.compute15
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v10, |v5|
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v9, |v1|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v12, v1
+; GFX10-NEXT:    v_ldexp_f32 v10, v10, 1
+; GFX10-NEXT:    v_ldexp_f32 v11, v9, 12
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v9, v5
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v12
+; GFX10-NEXT:    v_div_scale_f32 v14, s4, v10, v10, 1.0
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, -1, v9
+; GFX10-NEXT:    v_rcp_f32_e32 v15, v14
+; GFX10-NEXT:    v_not_b32_e32 v13, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v13, v13, v12
+; GFX10-NEXT:    v_div_scale_f32 v12, vcc_lo, 1.0, v10, 1.0
 ; GFX10-NEXT:    s_denorm_mode 15
-; GFX10-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v10
-; GFX10-NEXT:    v_mul_f32_e32 v11, v7, v10
-; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v7
-; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v10
-; GFX10-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX10-NEXT:    v_fma_f32 v16, -v14, v15, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v15, v16, v15
+; GFX10-NEXT:    v_mul_f32_e32 v16, v12, v15
+; GFX10-NEXT:    v_fma_f32 v17, -v14, v16, v12
+; GFX10-NEXT:    v_fmac_f32_e32 v16, v17, v15
+; GFX10-NEXT:    v_fma_f32 v12, -v14, v16, v12
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
-; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
-; GFX10-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX10-NEXT:    v_fma_f32 v2, -v7, v6, v2
-; GFX10-NEXT:    v_div_scale_f32 v7, s2, v5, v5, v1
-; GFX10-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v9, v7
+; GFX10-NEXT:    v_div_fmas_f32 v12, v12, v15, v16
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v13
+; GFX10-NEXT:    v_div_fixup_f32 v12, v12, v10, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB12_15
+; GFX10-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 12
+; GFX10-NEXT:  .LBB12_13: ; %frem.loop_body23
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v14, v11
+; GFX10-NEXT:    s_add_i32 s2, s2, -12
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX10-NEXT:    v_mul_f32_e32 v11, v14, v12
+; GFX10-NEXT:    v_rndne_f32_e32 v11, v11
+; GFX10-NEXT:    v_fma_f32 v11, -v11, v10, v14
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_add_f32_e32 v13, v11, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v11, v11, 12
+; GFX10-NEXT:    s_cbranch_scc1 .LBB12_13
+; GFX10-NEXT:  ; %bb.14: ; %Flow130
+; GFX10-NEXT:    v_mov_b32_e32 v13, s2
+; GFX10-NEXT:    v_mov_b32_e32 v11, v14
+; GFX10-NEXT:  .LBB12_15: ; %frem.loop_exit24
+; GFX10-NEXT:    v_add_nc_u32_e32 v13, -11, v13
+; GFX10-NEXT:    v_ldexp_f32 v11, v11, v13
+; GFX10-NEXT:    v_mul_f32_e32 v12, v11, v12
+; GFX10-NEXT:    v_rndne_f32_e32 v12, v12
+; GFX10-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_add_f32_e32 v10, v11, v10
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v9, v10, v9
+; GFX10-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; GFX10-NEXT:    v_xor_b32_e32 v9, v10, v9
+; GFX10-NEXT:  .LBB12_16:
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s2, |v2|, |v6|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB12_18
+; GFX10-NEXT:  ; %bb.17: ; %frem.else50
+; GFX10-NEXT:    v_bfi_b32 v10, 0x7fffffff, 0, v2
+; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6|
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB12_19
+; GFX10-NEXT:    s_branch .LBB12_24
+; GFX10-NEXT:  .LBB12_18:
+; GFX10-NEXT:    ; implicit-def: $vgpr10
+; GFX10-NEXT:  .LBB12_19: ; %frem.compute49
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v11, |v6|
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v10, |v2|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v13, v2
+; GFX10-NEXT:    v_ldexp_f32 v11, v11, 1
+; GFX10-NEXT:    v_ldexp_f32 v12, v10, 12
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v10, v6
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v13
+; GFX10-NEXT:    v_div_scale_f32 v15, s4, v11, v11, 1.0
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v10
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX10-NEXT:    v_rcp_f32_e32 v16, v15
+; GFX10-NEXT:    v_not_b32_e32 v14, v10
+; GFX10-NEXT:    v_add_nc_u32_e32 v14, v14, v13
+; GFX10-NEXT:    v_div_scale_f32 v13, vcc_lo, 1.0, v11, 1.0
 ; GFX10-NEXT:    s_denorm_mode 15
-; GFX10-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v9
-; GFX10-NEXT:    v_mul_f32_e32 v10, v6, v9
-; GFX10-NEXT:    v_fma_f32 v11, -v7, v10, v6
-; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v9
-; GFX10-NEXT:    v_fma_f32 v6, -v7, v10, v6
+; GFX10-NEXT:    v_fma_f32 v17, -v15, v16, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v16, v17, v16
+; GFX10-NEXT:    v_mul_f32_e32 v17, v13, v16
+; GFX10-NEXT:    v_fma_f32 v18, -v15, v17, v13
+; GFX10-NEXT:    v_fmac_f32_e32 v17, v18, v16
+; GFX10-NEXT:    v_fma_f32 v13, -v15, v17, v13
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
-; GFX10-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
-; GFX10-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX10-NEXT:    v_fma_f32 v1, -v6, v5, v1
-; GFX10-NEXT:    v_div_scale_f32 v6, s2, v4, v4, v0
-; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
-; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX10-NEXT:    v_div_fmas_f32 v13, v13, v16, v17
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v14
+; GFX10-NEXT:    v_div_fixup_f32 v13, v13, v11, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB12_23
+; GFX10-NEXT:  ; %bb.20: ; %frem.loop_body57.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 12
+; GFX10-NEXT:  .LBB12_21: ; %frem.loop_body57
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v15, v12
+; GFX10-NEXT:    s_add_i32 s2, s2, -12
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX10-NEXT:    v_mul_f32_e32 v12, v15, v13
+; GFX10-NEXT:    v_rndne_f32_e32 v12, v12
+; GFX10-NEXT:    v_fma_f32 v12, -v12, v11, v15
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_add_f32_e32 v14, v12, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v12, v12, 12
+; GFX10-NEXT:    s_cbranch_scc1 .LBB12_21
+; GFX10-NEXT:  ; %bb.22: ; %Flow126
+; GFX10-NEXT:    v_mov_b32_e32 v14, s2
+; GFX10-NEXT:    v_mov_b32_e32 v12, v15
+; GFX10-NEXT:  .LBB12_23: ; %frem.loop_exit58
+; GFX10-NEXT:    v_add_nc_u32_e32 v14, -11, v14
+; GFX10-NEXT:    v_ldexp_f32 v12, v12, v14
+; GFX10-NEXT:    v_mul_f32_e32 v13, v12, v13
+; GFX10-NEXT:    v_rndne_f32_e32 v13, v13
+; GFX10-NEXT:    v_fma_f32 v12, -v13, v11, v12
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_add_f32_e32 v11, v12, v11
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v10, v11, v10
+; GFX10-NEXT:    v_and_b32_e32 v11, 0x80000000, v2
+; GFX10-NEXT:    v_xor_b32_e32 v10, v11, v10
+; GFX10-NEXT:  .LBB12_24:
+; GFX10-NEXT:    v_cmp_ngt_f32_e64 s2, |v3|, |v7|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB12_26
+; GFX10-NEXT:  ; %bb.25: ; %frem.else84
+; GFX10-NEXT:    v_bfi_b32 v11, 0x7fffffff, 0, v3
+; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7|
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB12_27
+; GFX10-NEXT:    s_branch .LBB12_32
+; GFX10-NEXT:  .LBB12_26:
+; GFX10-NEXT:    ; implicit-def: $vgpr11
+; GFX10-NEXT:  .LBB12_27: ; %frem.compute83
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v12, |v7|
+; GFX10-NEXT:    v_frexp_mant_f32_e64 v11, |v3|
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v14, v3
+; GFX10-NEXT:    v_ldexp_f32 v12, v12, 1
+; GFX10-NEXT:    v_ldexp_f32 v13, v11, 12
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v11, v7
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v14
+; GFX10-NEXT:    v_div_scale_f32 v16, s4, v12, v12, 1.0
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v11
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, -1, v11
+; GFX10-NEXT:    v_rcp_f32_e32 v17, v16
+; GFX10-NEXT:    v_not_b32_e32 v15, v11
+; GFX10-NEXT:    v_add_nc_u32_e32 v15, v15, v14
+; GFX10-NEXT:    v_div_scale_f32 v14, vcc_lo, 1.0, v12, 1.0
 ; GFX10-NEXT:    s_denorm_mode 15
-; GFX10-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
-; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v7
-; GFX10-NEXT:    v_mul_f32_e32 v9, v5, v7
-; GFX10-NEXT:    v_fma_f32 v10, -v6, v9, v5
-; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v7
-; GFX10-NEXT:    v_fma_f32 v5, -v6, v9, v5
+; GFX10-NEXT:    v_fma_f32 v18, -v16, v17, 1.0
+; GFX10-NEXT:    v_fmac_f32_e32 v17, v18, v17
+; GFX10-NEXT:    v_mul_f32_e32 v18, v14, v17
+; GFX10-NEXT:    v_fma_f32 v19, -v16, v18, v14
+; GFX10-NEXT:    v_fmac_f32_e32 v18, v19, v17
+; GFX10-NEXT:    v_fma_f32 v14, -v16, v18, v14
 ; GFX10-NEXT:    s_denorm_mode 12
-; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
-; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX10-NEXT:    v_fma_f32 v0, -v5, v4, v0
-; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX10-NEXT:    v_div_fmas_f32 v14, v14, v17, v18
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v15
+; GFX10-NEXT:    v_div_fixup_f32 v14, v14, v12, 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB12_31
+; GFX10-NEXT:  ; %bb.28: ; %frem.loop_body91.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 12
+; GFX10-NEXT:  .LBB12_29: ; %frem.loop_body91
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v16, v13
+; GFX10-NEXT:    s_add_i32 s2, s2, -12
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX10-NEXT:    v_mul_f32_e32 v13, v16, v14
+; GFX10-NEXT:    v_rndne_f32_e32 v13, v13
+; GFX10-NEXT:    v_fma_f32 v13, -v13, v12, v16
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_add_f32_e32 v15, v13, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v15, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v13, v13, 12
+; GFX10-NEXT:    s_cbranch_scc1 .LBB12_29
+; GFX10-NEXT:  ; %bb.30: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v15, s2
+; GFX10-NEXT:    v_mov_b32_e32 v13, v16
+; GFX10-NEXT:  .LBB12_31: ; %frem.loop_exit92
+; GFX10-NEXT:    v_add_nc_u32_e32 v15, -11, v15
+; GFX10-NEXT:    v_ldexp_f32 v13, v13, v15
+; GFX10-NEXT:    v_mul_f32_e32 v14, v13, v14
+; GFX10-NEXT:    v_rndne_f32_e32 v14, v14
+; GFX10-NEXT:    v_fma_f32 v13, -v14, v12, v13
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v13
+; GFX10-NEXT:    v_add_f32_e32 v12, v13, v12
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc_lo
+; GFX10-NEXT:    v_ldexp_f32 v11, v12, v11
+; GFX10-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; GFX10-NEXT:    v_xor_b32_e32 v11, v12, v11
+; GFX10-NEXT:  .LBB12_32: ; %Flow125
+; GFX10-NEXT:    v_cmp_class_f32_e64 s2, v4, 0x3fc
+; GFX10-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    v_cmp_class_f32_e64 s3, v1, 0x1f8
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 s2, v5, 0x3fc
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    v_cmp_class_f32_e64 s3, v2, 0x1f8
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 s2, v6, 0x3fc
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    v_cmp_class_f32_e64 s3, v3, 0x1f8
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f32_e64 s2, v7, 0x3fc
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7fc00000, v11, vcc_lo
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: frem_v4f32:
@@ -3659,199 +13485,838 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
-; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[4:5] offset:64
+; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[4:5] offset:64
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_div_scale_f32 v10, null, v7, v7, v3
-; GFX11-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v11, v10
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s2, |v0|, |v4|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB12_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_bfi_b32 v8, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB12_3
+; GFX11-NEXT:    s_branch .LBB12_8
+; GFX11-NEXT:  .LBB12_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr8
+; GFX11-NEXT:  .LBB12_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v9, |v4|
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v8, |v0|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v11, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v9, v9, 1
+; GFX11-NEXT:    v_ldexp_f32 v10, v8, 12
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v8, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v11
+; GFX11-NEXT:    v_div_scale_f32 v13, null, v9, v9, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v8, -1, v8
+; GFX11-NEXT:    v_rcp_f32_e32 v14, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v12, v8
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, v12, v11
+; GFX11-NEXT:    v_div_scale_f32 v11, vcc_lo, 1.0, v9, 1.0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
-; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v11
+; GFX11-NEXT:    v_fma_f32 v15, -v13, v14, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v12, v9, v11
-; GFX11-NEXT:    v_fma_f32 v13, -v10, v12, v9
+; GFX11-NEXT:    v_fmac_f32_e32 v14, v15, v14
+; GFX11-NEXT:    v_mul_f32_e32 v15, v11, v14
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v12, v13, v11
-; GFX11-NEXT:    v_fma_f32 v9, -v10, v12, v9
+; GFX11-NEXT:    v_fma_f32 v16, -v13, v15, v11
+; GFX11-NEXT:    v_fmac_f32_e32 v15, v16, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v11, -v13, v15, v11
 ; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v11, v11, v14, v15
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v11, v11, v9, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB12_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 12
+; GFX11-NEXT:  .LBB12_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v13, v10
+; GFX11-NEXT:    s_add_i32 s2, s2, -12
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
-; GFX11-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
+; GFX11-NEXT:    v_mul_f32_e32 v10, v13, v11
+; GFX11-NEXT:    v_rndne_f32_e32 v10, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v9, v9
-; GFX11-NEXT:    v_fma_f32 v3, -v9, v7, v3
-; GFX11-NEXT:    v_div_scale_f32 v9, null, v6, v6, v2
-; GFX11-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX11-NEXT:    v_fma_f32 v10, -v10, v9, v13
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v10
+; GFX11-NEXT:    v_add_f32_e32 v12, v10, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v10, v10, 12
+; GFX11-NEXT:    s_cbranch_scc1 .LBB12_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow134
+; GFX11-NEXT:    v_mov_b32_e32 v12, s2
+; GFX11-NEXT:    v_mov_b32_e32 v10, v13
+; GFX11-NEXT:  .LBB12_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v12, -11, v12
+; GFX11-NEXT:    v_ldexp_f32 v10, v10, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v11, v10, v11
+; GFX11-NEXT:    v_rndne_f32_e32 v11, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v10, -v11, v9, v10
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v10
+; GFX11-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v8, v9, v8
+; GFX11-NEXT:    v_and_b32_e32 v9, 0x80000000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v8, v9, v8
+; GFX11-NEXT:  .LBB12_8:
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s2, |v1|, |v5|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB12_10
+; GFX11-NEXT:  ; %bb.9: ; %frem.else16
+; GFX11-NEXT:    v_bfi_b32 v9, 0x7fffffff, 0, v1
+; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB12_11
+; GFX11-NEXT:    s_branch .LBB12_16
+; GFX11-NEXT:  .LBB12_10:
+; GFX11-NEXT:    ; implicit-def: $vgpr9
+; GFX11-NEXT:  .LBB12_11: ; %frem.compute15
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v10, |v5|
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v9, |v1|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v12, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v10, v10, 1
+; GFX11-NEXT:    v_ldexp_f32 v11, v9, 12
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v9, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v12
+; GFX11-NEXT:    v_div_scale_f32 v14, null, v10, v10, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v9, -1, v9
+; GFX11-NEXT:    v_rcp_f32_e32 v15, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v13, v9
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, v13, v12
+; GFX11-NEXT:    v_div_scale_f32 v12, vcc_lo, 1.0, v10, 1.0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
-; GFX11-NEXT:    v_fmac_f32_e32 v10, v11, v10
+; GFX11-NEXT:    v_fma_f32 v16, -v14, v15, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v11, v7, v10
-; GFX11-NEXT:    v_fma_f32 v12, -v9, v11, v7
+; GFX11-NEXT:    v_fmac_f32_e32 v15, v16, v15
+; GFX11-NEXT:    v_mul_f32_e32 v16, v12, v15
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v10
-; GFX11-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX11-NEXT:    v_fma_f32 v17, -v14, v16, v12
+; GFX11-NEXT:    v_fmac_f32_e32 v16, v17, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v12, -v14, v16, v12
 ; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v12, v12, v15, v16
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v12, v12, v10, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB12_15
+; GFX11-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 12
+; GFX11-NEXT:  .LBB12_13: ; %frem.loop_body23
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v14, v11
+; GFX11-NEXT:    s_add_i32 s2, s2, -12
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
-; GFX11-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX11-NEXT:    v_fma_f32 v2, -v7, v6, v2
-; GFX11-NEXT:    v_div_scale_f32 v7, null, v5, v5, v1
-; GFX11-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v9, v7
+; GFX11-NEXT:    v_mul_f32_e32 v11, v14, v12
+; GFX11-NEXT:    v_rndne_f32_e32 v11, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v11, -v11, v10, v14
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v11
+; GFX11-NEXT:    v_add_f32_e32 v13, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v11, v11, 12
+; GFX11-NEXT:    s_cbranch_scc1 .LBB12_13
+; GFX11-NEXT:  ; %bb.14: ; %Flow130
+; GFX11-NEXT:    v_mov_b32_e32 v13, s2
+; GFX11-NEXT:    v_mov_b32_e32 v11, v14
+; GFX11-NEXT:  .LBB12_15: ; %frem.loop_exit24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v13, -11, v13
+; GFX11-NEXT:    v_ldexp_f32 v11, v11, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v12, v11, v12
+; GFX11-NEXT:    v_rndne_f32_e32 v12, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v11, -v12, v10, v11
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v11
+; GFX11-NEXT:    v_add_f32_e32 v10, v11, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v9, v10, v9
+; GFX11-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v9, v10, v9
+; GFX11-NEXT:  .LBB12_16:
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s2, |v2|, |v6|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB12_18
+; GFX11-NEXT:  ; %bb.17: ; %frem.else50
+; GFX11-NEXT:    v_bfi_b32 v10, 0x7fffffff, 0, v2
+; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB12_19
+; GFX11-NEXT:    s_branch .LBB12_24
+; GFX11-NEXT:  .LBB12_18:
+; GFX11-NEXT:    ; implicit-def: $vgpr10
+; GFX11-NEXT:  .LBB12_19: ; %frem.compute49
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v11, |v6|
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v10, |v2|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v13, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v11, v11, 1
+; GFX11-NEXT:    v_ldexp_f32 v12, v10, 12
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v10, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v13
+; GFX11-NEXT:    v_div_scale_f32 v15, null, v11, v11, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX11-NEXT:    v_rcp_f32_e32 v16, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v14, v10
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, v14, v13
+; GFX11-NEXT:    v_div_scale_f32 v13, vcc_lo, 1.0, v11, 1.0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
-; GFX11-NEXT:    v_fmac_f32_e32 v9, v10, v9
+; GFX11-NEXT:    v_fma_f32 v17, -v15, v16, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v10, v6, v9
-; GFX11-NEXT:    v_fma_f32 v11, -v7, v10, v6
+; GFX11-NEXT:    v_fmac_f32_e32 v16, v17, v16
+; GFX11-NEXT:    v_mul_f32_e32 v17, v13, v16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v10, v11, v9
-; GFX11-NEXT:    v_fma_f32 v6, -v7, v10, v6
+; GFX11-NEXT:    v_fma_f32 v18, -v15, v17, v13
+; GFX11-NEXT:    v_fmac_f32_e32 v17, v18, v16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v13, -v15, v17, v13
 ; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v13, v13, v16, v17
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v13, v13, v11, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB12_23
+; GFX11-NEXT:  ; %bb.20: ; %frem.loop_body57.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 12
+; GFX11-NEXT:  .LBB12_21: ; %frem.loop_body57
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v15, v12
+; GFX11-NEXT:    s_add_i32 s2, s2, -12
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
-; GFX11-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
+; GFX11-NEXT:    v_mul_f32_e32 v12, v15, v13
+; GFX11-NEXT:    v_rndne_f32_e32 v12, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX11-NEXT:    v_fma_f32 v1, -v6, v5, v1
-; GFX11-NEXT:    v_div_scale_f32 v6, null, v4, v4, v0
-; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX11-NEXT:    v_fma_f32 v12, -v12, v11, v15
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v12
+; GFX11-NEXT:    v_add_f32_e32 v14, v12, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v12, v12, 12
+; GFX11-NEXT:    s_cbranch_scc1 .LBB12_21
+; GFX11-NEXT:  ; %bb.22: ; %Flow126
+; GFX11-NEXT:    v_mov_b32_e32 v14, s2
+; GFX11-NEXT:    v_mov_b32_e32 v12, v15
+; GFX11-NEXT:  .LBB12_23: ; %frem.loop_exit58
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v14, -11, v14
+; GFX11-NEXT:    v_ldexp_f32 v12, v12, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v13, v12, v13
+; GFX11-NEXT:    v_rndne_f32_e32 v13, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v12, -v13, v11, v12
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v12
+; GFX11-NEXT:    v_add_f32_e32 v11, v12, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v10, v11, v10
+; GFX11-NEXT:    v_and_b32_e32 v11, 0x80000000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v10, v11, v10
+; GFX11-NEXT:  .LBB12_24:
+; GFX11-NEXT:    v_cmp_ngt_f32_e64 s2, |v3|, |v7|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB12_26
+; GFX11-NEXT:  ; %bb.25: ; %frem.else84
+; GFX11-NEXT:    v_bfi_b32 v11, 0x7fffffff, 0, v3
+; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB12_27
+; GFX11-NEXT:    s_branch .LBB12_32
+; GFX11-NEXT:  .LBB12_26:
+; GFX11-NEXT:    ; implicit-def: $vgpr11
+; GFX11-NEXT:  .LBB12_27: ; %frem.compute83
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v12, |v7|
+; GFX11-NEXT:    v_frexp_mant_f32_e64 v11, |v3|
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v14, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_ldexp_f32 v12, v12, 1
+; GFX11-NEXT:    v_ldexp_f32 v13, v11, 12
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v11, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v14
+; GFX11-NEXT:    v_div_scale_f32 v16, null, v12, v12, 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v11, -1, v11
+; GFX11-NEXT:    v_rcp_f32_e32 v17, v16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v15, v11
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, v15, v14
+; GFX11-NEXT:    v_div_scale_f32 v14, vcc_lo, 1.0, v12, 1.0
 ; GFX11-NEXT:    s_denorm_mode 15
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v9, v7
+; GFX11-NEXT:    v_fma_f32 v18, -v16, v17, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v9, v5, v7
-; GFX11-NEXT:    v_fma_f32 v10, -v6, v9, v5
+; GFX11-NEXT:    v_fmac_f32_e32 v17, v18, v17
+; GFX11-NEXT:    v_mul_f32_e32 v18, v14, v17
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v9, v10, v7
-; GFX11-NEXT:    v_fma_f32 v5, -v6, v9, v5
+; GFX11-NEXT:    v_fma_f32 v19, -v16, v18, v14
+; GFX11-NEXT:    v_fmac_f32_e32 v18, v19, v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v14, -v16, v18, v14
 ; GFX11-NEXT:    s_denorm_mode 12
+; GFX11-NEXT:    v_div_fmas_f32 v14, v14, v17, v18
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f32 v14, v14, v12, 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB12_31
+; GFX11-NEXT:  ; %bb.28: ; %frem.loop_body91.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 12
+; GFX11-NEXT:  .LBB12_29: ; %frem.loop_body91
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v16, v13
+; GFX11-NEXT:    s_add_i32 s2, s2, -12
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v13, v16, v14
+; GFX11-NEXT:    v_rndne_f32_e32 v13, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v13, -v13, v12, v16
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v13
+; GFX11-NEXT:    v_add_f32_e32 v15, v13, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v13, v15, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v13, v13, 12
+; GFX11-NEXT:    s_cbranch_scc1 .LBB12_29
+; GFX11-NEXT:  ; %bb.30: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v15, s2
+; GFX11-NEXT:    v_mov_b32_e32 v13, v16
+; GFX11-NEXT:  .LBB12_31: ; %frem.loop_exit92
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v15, -11, v15
+; GFX11-NEXT:    v_ldexp_f32 v13, v13, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v14, v13, v14
+; GFX11-NEXT:    v_rndne_f32_e32 v14, v14
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
-; GFX11-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
+; GFX11-NEXT:    v_fma_f32 v13, -v14, v12, v13
+; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v13
+; GFX11-NEXT:    v_add_f32_e32 v12, v13, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX11-NEXT:    v_fma_f32 v0, -v5, v4, v0
-; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc_lo
+; GFX11-NEXT:    v_ldexp_f32 v11, v12, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v11, v12, v11
+; GFX11-NEXT:  .LBB12_32: ; %Flow125
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, v4, 0x3fc
+; GFX11-NEXT:    v_cmp_class_f32_e64 s3, v0, 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cmp_class_f32_e64 s3, v1, 0x1f8
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, v5, 0x3fc
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cmp_class_f32_e64 s3, v2, 0x1f8
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, v6, 0x3fc
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cmp_class_f32_e64 s3, v3, 0x1f8
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, v7, 0x3fc
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7fc00000, v11, vcc_lo
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v4f32:
 ; GFX1150:       ; %bb.0:
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v8, 0
+; GFX1150-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
-; GFX1150-NEXT:    global_load_b128 v[4:7], v8, s[4:5] offset:64
+; GFX1150-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX1150-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1150-NEXT:    global_load_b128 v[1:4], v4, s[8:9] offset:64
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_div_scale_f32 v10, null, v7, v7, v3
-; GFX1150-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v11, v10
+; GFX1150-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX1150-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
+; GFX1150-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX1150-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX1150-NEXT:    s_and_b32 s10, s8, 0x7fffffff
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, s10, v1
+; GFX1150-NEXT:    s_cbranch_vccz .LBB12_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, v0
+; GFX1150-NEXT:    v_cmp_eq_f32_e32 vcc_lo, s10, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v1, v0, v2, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB12_3
+; GFX1150-NEXT:    s_branch .LBB12_8
+; GFX1150-NEXT:  .LBB12_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr1
+; GFX1150-NEXT:  .LBB12_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v2, |s8|
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v1, |v0|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v4, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v2, 1
+; GFX1150-NEXT:    v_ldexp_f32 v3, v1, 12
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v1, s8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s9, v4
+; GFX1150-NEXT:    v_div_scale_f32 v6, null, v2, v2, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s10, v1
+; GFX1150-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v5, v1
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, v5, v4
+; GFX1150-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
 ; GFX1150-NEXT:    s_denorm_mode 15
-; GFX1150-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v11, v12, v11
-; GFX1150-NEXT:    v_mul_f32_e32 v12, v9, v11
+; GFX1150-NEXT:    v_mul_f32_e32 v8, v4, v7
+; GFX1150-NEXT:    v_fma_f32 v9, -v6, v8, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v13, -v10, v12, v9
-; GFX1150-NEXT:    v_fmac_f32_e32 v12, v13, v11
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v9, -v10, v12, v9
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX1150-NEXT:    v_fma_f32 v4, -v6, v8, v4
 ; GFX1150-NEXT:    s_denorm_mode 12
-; GFX1150-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v5
+; GFX1150-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB12_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s9, s9, s10
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s9, s9, 12
+; GFX1150-NEXT:  .LBB12_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v3
+; GFX1150-NEXT:    s_add_i32 s9, s9, -12
+; GFX1150-NEXT:    s_cmp_gt_i32 s9, 12
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
-; GFX1150-NEXT:    v_trunc_f32_e32 v9, v9
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v3, v3
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
-; GFX1150-NEXT:    v_fma_f32 v3, v9, v7, v3
-; GFX1150-NEXT:    v_div_scale_f32 v9, null, v6, v6, v2
-; GFX1150-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v10, v9
-; GFX1150-NEXT:    s_denorm_mode 15
-; GFX1150-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fma_f32 v3, v3, v2, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v5, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 12
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB12_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow134
+; GFX1150-NEXT:    v_mov_b32_e32 v5, s9
+; GFX1150-NEXT:    v_mov_b32_e32 v3, v6
+; GFX1150-NEXT:  .LBB12_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v5, -11, v5
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v10
-; GFX1150-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v12, -v9, v11, v7
-; GFX1150-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fmac_f32_e32 v3, v4, v2
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1150-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v1, v2, v1
+; GFX1150-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
+; GFX1150-NEXT:    v_xor_b32_e32 v1, v2, v1
+; GFX1150-NEXT:  .LBB12_8:
+; GFX1150-NEXT:    s_and_b32 s10, s6, 0x7fffffff
+; GFX1150-NEXT:    s_and_b32 s11, s7, 0x7fffffff
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_cmp_ngt_f32 s10, s11
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB12_10
+; GFX1150-NEXT:  ; %bb.9: ; %frem.else16
+; GFX1150-NEXT:    s_cmp_eq_f32 s10, s11
+; GFX1150-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, s6
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB12_11
+; GFX1150-NEXT:    s_branch .LBB12_16
+; GFX1150-NEXT:  .LBB12_10:
+; GFX1150-NEXT:    ; implicit-def: $vgpr2
+; GFX1150-NEXT:  .LBB12_11: ; %frem.compute15
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v3, |s7|
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v2, |s6|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v5, s6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v3, 1
+; GFX1150-NEXT:    v_ldexp_f32 v4, v2, 12
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v2, s7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX1150-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX1150-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v6, v2
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, v6, v5
+; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v9, v5, v8
+; GFX1150-NEXT:    v_fma_f32 v10, -v7, v9, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v8
+; GFX1150-NEXT:    v_fma_f32 v5, -v7, v9, v5
 ; GFX1150-NEXT:    s_denorm_mode 12
-; GFX1150-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v6
+; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB12_15
+; GFX1150-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1150-NEXT:    s_sub_i32 s9, s9, s10
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s9, s9, 12
+; GFX1150-NEXT:  .LBB12_13: ; %frem.loop_body23
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1150-NEXT:    s_add_i32 s9, s9, -12
+; GFX1150-NEXT:    s_cmp_gt_i32 s9, 12
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
-; GFX1150-NEXT:    v_trunc_f32_e32 v7, v7
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v7, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
-; GFX1150-NEXT:    v_fma_f32 v2, v7, v6, v2
-; GFX1150-NEXT:    v_div_scale_f32 v7, null, v5, v5, v1
-; GFX1150-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v9, v7
-; GFX1150-NEXT:    s_denorm_mode 15
-; GFX1150-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
+; GFX1150-NEXT:    v_fma_f32 v4, v4, v3, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, 12
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB12_13
+; GFX1150-NEXT:  ; %bb.14: ; %Flow130
+; GFX1150-NEXT:    v_mov_b32_e32 v6, s9
+; GFX1150-NEXT:    v_mov_b32_e32 v4, v7
+; GFX1150-NEXT:  .LBB12_15: ; %frem.loop_exit24
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v6, -11, v6
+; GFX1150-NEXT:    s_and_b32 s9, s6, 0x80000000
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v4, v5, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
+; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v2, v3, v2
+; GFX1150-NEXT:    v_xor_b32_e32 v2, s9, v2
+; GFX1150-NEXT:  .LBB12_16:
+; GFX1150-NEXT:    s_and_b32 s10, s4, 0x7fffffff
+; GFX1150-NEXT:    s_and_b32 s11, s5, 0x7fffffff
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_cmp_ngt_f32 s10, s11
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB12_18
+; GFX1150-NEXT:  ; %bb.17: ; %frem.else50
+; GFX1150-NEXT:    s_cmp_eq_f32 s10, s11
+; GFX1150-NEXT:    v_bfi_b32 v3, 0x7fffffff, 0, s4
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, s4, v3, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB12_19
+; GFX1150-NEXT:    s_branch .LBB12_24
+; GFX1150-NEXT:  .LBB12_18:
+; GFX1150-NEXT:    ; implicit-def: $vgpr3
+; GFX1150-NEXT:  .LBB12_19: ; %frem.compute49
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v4, |s5|
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v3, |s4|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v6, s4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v4, v4, 1
+; GFX1150-NEXT:    v_ldexp_f32 v5, v3, 12
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v3, s5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s9, v6
+; GFX1150-NEXT:    v_div_scale_f32 v8, null, v4, v4, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s10, v3
+; GFX1150-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX1150-NEXT:    v_rcp_f32_e32 v9, v8
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v7, v3
+; GFX1150-NEXT:    v_add_nc_u32_e32 v7, v7, v6
+; GFX1150-NEXT:    v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
 ; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1150-NEXT:    v_mul_f32_e32 v10, v6, v9
+; GFX1150-NEXT:    v_fma_f32 v11, -v8, v10, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v11, -v7, v10, v6
 ; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v9
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v6, -v7, v10, v6
+; GFX1150-NEXT:    v_fma_f32 v6, -v8, v10, v6
 ; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1150-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v7
+; GFX1150-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB12_23
+; GFX1150-NEXT:  ; %bb.20: ; %frem.loop_body57.preheader
+; GFX1150-NEXT:    s_sub_i32 s9, s9, s10
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s9, s9, 12
+; GFX1150-NEXT:  .LBB12_21: ; %frem.loop_body57
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v8, v5
+; GFX1150-NEXT:    s_add_i32 s9, s9, -12
+; GFX1150-NEXT:    s_cmp_gt_i32 s9, 12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v8, v6
+; GFX1150-NEXT:    v_rndne_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fma_f32 v5, v5, v4, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v7, v5, v4
+; GFX1150-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, 12
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB12_21
+; GFX1150-NEXT:  ; %bb.22: ; %Flow126
+; GFX1150-NEXT:    v_mov_b32_e32 v7, s9
+; GFX1150-NEXT:    v_mov_b32_e32 v5, v8
+; GFX1150-NEXT:  .LBB12_23: ; %frem.loop_exit58
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v7, -11, v7
+; GFX1150-NEXT:    s_and_b32 s9, s4, 0x80000000
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
-; GFX1150-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v5, v6
+; GFX1150-NEXT:    v_rndne_f32_e32 v6, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
-; GFX1150-NEXT:    v_fma_f32 v1, v6, v5, v1
-; GFX1150-NEXT:    v_div_scale_f32 v6, null, v4, v4, v0
-; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1150-NEXT:    v_fmac_f32_e32 v5, v6, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1150-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v3, v4, v3
+; GFX1150-NEXT:    v_xor_b32_e32 v3, s9, v3
+; GFX1150-NEXT:  .LBB12_24:
+; GFX1150-NEXT:    s_and_b32 s10, s2, 0x7fffffff
+; GFX1150-NEXT:    s_and_b32 s11, s3, 0x7fffffff
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_cmp_ngt_f32 s10, s11
+; GFX1150-NEXT:    s_cbranch_scc0 .LBB12_26
+; GFX1150-NEXT:  ; %bb.25: ; %frem.else84
+; GFX1150-NEXT:    s_cmp_eq_f32 s10, s11
+; GFX1150-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, s2
+; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v4, s2, v4, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB12_27
+; GFX1150-NEXT:    s_branch .LBB12_32
+; GFX1150-NEXT:  .LBB12_26:
+; GFX1150-NEXT:    ; implicit-def: $vgpr4
+; GFX1150-NEXT:  .LBB12_27: ; %frem.compute83
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v5, |s3|
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v4, |s2|
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v7, s2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_ldexp_f32 v5, v5, 1
+; GFX1150-NEXT:    v_ldexp_f32 v6, v4, 12
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v4, s3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_readfirstlane_b32 s9, v7
+; GFX1150-NEXT:    v_div_scale_f32 v9, null, v5, v5, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_readfirstlane_b32 s10, v4
+; GFX1150-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX1150-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v8, v4
+; GFX1150-NEXT:    v_add_nc_u32_e32 v8, v8, v7
+; GFX1150-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0
 ; GFX1150-NEXT:    s_denorm_mode 15
-; GFX1150-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v10
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v7, v9, v7
-; GFX1150-NEXT:    v_mul_f32_e32 v9, v5, v7
+; GFX1150-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX1150-NEXT:    v_fma_f32 v12, -v9, v11, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v10, -v6, v9, v5
-; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f32 v5, -v6, v9, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX1150-NEXT:    v_fma_f32 v7, -v9, v11, v7
 ; GFX1150-NEXT:    s_denorm_mode 12
-; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v8
+; GFX1150-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB12_31
+; GFX1150-NEXT:  ; %bb.28: ; %frem.loop_body91.preheader
+; GFX1150-NEXT:    s_sub_i32 s9, s9, s10
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s9, s9, 12
+; GFX1150-NEXT:  .LBB12_29: ; %frem.loop_body91
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1150-NEXT:    s_add_i32 s9, s9, -12
+; GFX1150-NEXT:    s_cmp_gt_i32 s9, 12
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
-; GFX1150-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v9, v7
+; GFX1150-NEXT:    v_rndne_f32_e32 v6, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
-; GFX1150-NEXT:    v_fmac_f32_e32 v0, v5, v4
-; GFX1150-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX1150-NEXT:    v_fma_f32 v6, v6, v5, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX1150-NEXT:    v_add_f32_e32 v8, v6, v5
+; GFX1150-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v6, v6, 12
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB12_29
+; GFX1150-NEXT:  ; %bb.30: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v8, s9
+; GFX1150-NEXT:    v_mov_b32_e32 v6, v9
+; GFX1150-NEXT:  .LBB12_31: ; %frem.loop_exit92
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_nc_u32_e32 v8, -11, v8
+; GFX1150-NEXT:    s_and_b32 s9, s2, 0x80000000
+; GFX1150-NEXT:    v_ldexp_f32 v6, v6, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX1150-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX1150-NEXT:    v_fmac_f32_e32 v6, v7, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX1150-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX1150-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX1150-NEXT:    v_xor_b32_e32 v4, s9, v4
+; GFX1150-NEXT:  .LBB12_32: ; %Flow125
+; GFX1150-NEXT:    s_cmp_neq_f32 s8, 0
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s8, s8, 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s10, v0, 0x1f8
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s6, s6, 0x1f8
+; GFX1150-NEXT:    s_cselect_b32 s9, -1, 0
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s4, s4, 0x1f8
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX1150-NEXT:    s_and_b32 s8, s8, s10
+; GFX1150-NEXT:    v_mov_b32_e32 v5, 0
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s8, s9
+; GFX1150-NEXT:    s_cmp_neq_f32 s7, 0
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s7, s7, 0x3fc
+; GFX1150-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo
+; GFX1150-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s6, s7, s6
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s6, s8
+; GFX1150-NEXT:    s_cmp_neq_f32 s5, 0
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s5, s5, 0x3fc
+; GFX1150-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v2, vcc_lo
+; GFX1150-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s4, s5, s4
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s4, s6
+; GFX1150-NEXT:    s_cmp_neq_f32 s3, 0
+; GFX1150-NEXT:    v_cmp_class_f32_e64 s3, s3, 0x3fc
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v3, vcc_lo
+; GFX1150-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s3, s2
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, s4
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, 0x7fc00000, v4, vcc_lo
+; GFX1150-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
@@ -3865,230 +14330,792 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_v2f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
+; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_mov_b32 s0, s2
-; SI-NEXT:    s_mov_b32 s1, s3
+; SI-NEXT:    s_mov_b32 s4, s10
+; SI-NEXT:    s_mov_b32 s5, s11
 ; SI-NEXT:    s_mov_b32 s2, s6
 ; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    s_mov_b32 s10, s6
-; SI-NEXT:    s_mov_b32 s11, s7
-; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
+; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
-; SI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
-; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
-; SI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
-; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v3, v13
+; SI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB13_2
+; SI-NEXT:  ; %bb.1: ; %frem.else
+; SI-NEXT:    v_and_b32_e32 v8, 0x80000000, v1
+; SI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
+; SI-NEXT:    v_cndmask_b32_e32 v9, v1, v8, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB13_3
+; SI-NEXT:    s_branch .LBB13_8
+; SI-NEXT:  .LBB13_2:
+; SI-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB13_3: ; %frem.compute
+; SI-NEXT:    s_brev_b32 s5, -2
+; SI-NEXT:    v_and_b32_e32 v10, 0x7fffffff, v1
+; SI-NEXT:    s_mov_b32 s0, 0
+; SI-NEXT:    s_mov_b32 s1, 0x7ff00000
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[0:1]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
+; SI-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v10, v[0:1]
+; SI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v10
+; SI-NEXT:    s_cselect_b32 s3, s2, 0
+; SI-NEXT:    v_ldexp_f64 v[10:11], v[8:9], 26
+; SI-NEXT:    v_and_b32_e32 v12, 0x7fffffff, v5
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[4:5]|, s[0:1]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[4:5]|
+; SI-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v8, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v12, v[4:5]
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v12
+; SI-NEXT:    s_cselect_b32 s7, s0, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_add_i32 s4, s7, -1
+; SI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 1
+; SI-NEXT:    s_not_b32 s0, s4
+; SI-NEXT:    s_add_i32 s6, s0, s3
+; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[8:9], v[8:9], 1.0
+; SI-NEXT:    v_rcp_f64_e32 v[14:15], v[12:13]
+; SI-NEXT:    v_fma_f64 v[16:17], -v[12:13], v[14:15], 1.0
+; SI-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; SI-NEXT:    v_fma_f64 v[16:17], -v[12:13], v[14:15], 1.0
+; SI-NEXT:    v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15]
+; SI-NEXT:    v_div_scale_f64 v[16:17], s[0:1], 1.0, v[8:9], 1.0
+; SI-NEXT:    v_mul_f64 v[18:19], v[16:17], v[14:15]
+; SI-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], v[16:17]
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v13
+; SI-NEXT:    s_mov_b32 s0, 0x3ff00000
+; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, v17
 ; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; SI-NEXT:    s_nop 1
-; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
-; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
-; SI-NEXT:    v_readfirstlane_b32 s8, v9
-; SI-NEXT:    s_bfe_u32 s0, s8, 0xb0014
-; SI-NEXT:    s_add_i32 s9, s0, 0xfffffc01
-; SI-NEXT:    s_mov_b32 s3, 0xfffff
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s9
-; SI-NEXT:    v_not_b32_e32 v10, s0
-; SI-NEXT:    v_and_b32_e32 v10, v8, v10
-; SI-NEXT:    v_not_b32_e32 v11, s1
-; SI-NEXT:    v_and_b32_e32 v9, v9, v11
-; SI-NEXT:    s_and_b32 s0, s8, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s9, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v11, s0
-; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
-; SI-NEXT:    s_cmp_gt_i32 s9, 51
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_div_fmas_f64 v[12:13], v[20:21], v[14:15], v[18:19]
+; SI-NEXT:    v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
+; SI-NEXT:    s_cmp_lt_i32 s6, 27
+; SI-NEXT:    s_cbranch_scc1 .LBB13_7
+; SI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT:    s_sub_i32 s0, s3, s7
+; SI-NEXT:    s_add_i32 s6, s0, 26
+; SI-NEXT:    s_mov_b32 s3, 0x432fffff
+; SI-NEXT:    v_mov_b32_e32 v18, 0x43300000
+; SI-NEXT:    v_mov_b32_e32 v14, 0
+; SI-NEXT:  .LBB13_5: ; %frem.loop_body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v17, v11
+; SI-NEXT:    v_mov_b32_e32 v16, v10
+; SI-NEXT:    v_mul_f64 v[10:11], v[16:17], v[12:13]
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[10:11]|, s[2:3]
+; SI-NEXT:    v_bfi_b32 v15, s5, v18, v11
+; SI-NEXT:    v_add_f64 v[19:20], v[10:11], v[14:15]
+; SI-NEXT:    v_add_f64 v[19:20], v[19:20], -v[14:15]
+; SI-NEXT:    v_cndmask_b32_e32 v11, v20, v11, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v10, v19, v10, vcc
+; SI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[8:9], v[16:17]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[10:11]
+; SI-NEXT:    v_add_f64 v[19:20], v[10:11], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v20, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v10, v10, v19, vcc
+; SI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 26
+; SI-NEXT:    s_sub_i32 s6, s6, 26
+; SI-NEXT:    s_cmp_gt_i32 s6, 26
+; SI-NEXT:    s_cbranch_scc1 .LBB13_5
+; SI-NEXT:  ; %bb.6: ; %Flow54
+; SI-NEXT:    v_mov_b32_e32 v10, v16
+; SI-NEXT:    v_mov_b32_e32 v11, v17
+; SI-NEXT:  .LBB13_7: ; %frem.loop_exit
+; SI-NEXT:    s_sub_i32 s0, s6, 25
+; SI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], s0
+; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[12:13]
+; SI-NEXT:    s_mov_b32 s0, -1
+; SI-NEXT:    s_mov_b32 s1, 0x432fffff
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[12:13]|, s[0:1]
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_mov_b32_e32 v14, 0x43300000
+; SI-NEXT:    v_bfi_b32 v15, s0, v14, v13
+; SI-NEXT:    v_mov_b32_e32 v14, 0
+; SI-NEXT:    v_add_f64 v[16:17], v[12:13], v[14:15]
+; SI-NEXT:    v_add_f64 v[14:15], v[16:17], -v[14:15]
+; SI-NEXT:    v_cndmask_b32_e32 v13, v15, v13, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v12, v14, v12, vcc
+; SI-NEXT:    v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[10:11]
+; SI-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; SI-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
-; SI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
-; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
-; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
-; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; SI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
-; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
-; SI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
-; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v11
+; SI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], s4
+; SI-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; SI-NEXT:    v_xor_b32_e32 v9, v10, v9
+; SI-NEXT:  .LBB13_8:
+; SI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[2:3]|, |v[6:7]|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB13_10
+; SI-NEXT:  ; %bb.9: ; %frem.else16
+; SI-NEXT:    v_and_b32_e32 v10, 0x80000000, v3
+; SI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
+; SI-NEXT:    v_cndmask_b32_e32 v11, v3, v10, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v10, v2, 0, vcc
+; SI-NEXT:    s_mov_b64 vcc, exec
+; SI-NEXT:    s_cbranch_execz .LBB13_11
+; SI-NEXT:    s_branch .LBB13_16
+; SI-NEXT:  .LBB13_10:
+; SI-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:  .LBB13_11: ; %frem.compute15
+; SI-NEXT:    s_brev_b32 s5, -2
+; SI-NEXT:    v_and_b32_e32 v12, 0x7fffffff, v3
+; SI-NEXT:    s_mov_b32 s0, 0
+; SI-NEXT:    s_mov_b32 s1, 0x7ff00000
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[0:1]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
+; SI-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v12, v[2:3]
+; SI-NEXT:    s_and_b64 s[2:3], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s2, v12
+; SI-NEXT:    s_cselect_b32 s3, s2, 0
+; SI-NEXT:    v_ldexp_f64 v[12:13], v[10:11], 26
+; SI-NEXT:    v_and_b32_e32 v14, 0x7fffffff, v7
+; SI-NEXT:    v_cmp_lt_f64_e64 vcc, |v[6:7]|, s[0:1]
+; SI-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[6:7]|
+; SI-NEXT:    v_cndmask_b32_e32 v11, v14, v11, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v10, v6, v10, vcc
+; SI-NEXT:    v_frexp_exp_i32_f64_e32 v14, v[6:7]
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s0, v14
+; SI-NEXT:    s_cselect_b32 s7, s0, 0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_add_i32 s4, s7, -1
+; SI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 1
+; SI-NEXT:    s_not_b32 s0, s4
+; SI-NEXT:    s_add_i32 s6, s0, s3
+; SI-NEXT:    v_div_scale_f64 v[14:15], s[0:1], v[10:11], v[10:11], 1.0
+; SI-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
+; SI-NEXT:    v_fma_f64 v[18:19], -v[14:15], v[16:17], 1.0
+; SI-NEXT:    v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
+; SI-NEXT:    v_fma_f64 v[18:19], -v[14:15], v[16:17], 1.0
+; SI-NEXT:    v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
+; SI-NEXT:    v_div_scale_f64 v[18:19], s[0:1], 1.0, v[10:11], 1.0
+; SI-NEXT:    v_mul_f64 v[20:21], v[18:19], v[16:17]
+; SI-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[20:21], v[18:19]
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v15
+; SI-NEXT:    s_mov_b32 s0, 0x3ff00000
+; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, v19
 ; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
-; SI-NEXT:    s_nop 1
-; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
-; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
-; SI-NEXT:    v_readfirstlane_b32 s8, v7
-; SI-NEXT:    s_bfe_u32 s0, s8, 0xb0014
-; SI-NEXT:    s_add_i32 s9, s0, 0xfffffc01
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s9
-; SI-NEXT:    v_not_b32_e32 v8, s0
-; SI-NEXT:    v_and_b32_e32 v8, v6, v8
-; SI-NEXT:    v_not_b32_e32 v9, s1
-; SI-NEXT:    v_and_b32_e32 v7, v7, v9
-; SI-NEXT:    s_and_b32 s0, s8, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s9, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v9, s0
-; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; SI-NEXT:    s_cmp_gt_i32 s9, 51
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_mov_b32_e32 v9, s8
-; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; SI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_div_fmas_f64 v[14:15], v[22:23], v[16:17], v[20:21]
+; SI-NEXT:    v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
+; SI-NEXT:    s_cmp_lt_i32 s6, 27
+; SI-NEXT:    s_cbranch_scc1 .LBB13_15
+; SI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; SI-NEXT:    s_sub_i32 s0, s3, s7
+; SI-NEXT:    s_add_i32 s6, s0, 26
+; SI-NEXT:    s_mov_b32 s3, 0x432fffff
+; SI-NEXT:    v_mov_b32_e32 v20, 0x43300000
+; SI-NEXT:    v_mov_b32_e32 v16, 0
+; SI-NEXT:  .LBB13_13: ; %frem.loop_body23
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v19, v13
+; SI-NEXT:    v_mov_b32_e32 v18, v12
+; SI-NEXT:    v_mul_f64 v[12:13], v[18:19], v[14:15]
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[12:13]|, s[2:3]
+; SI-NEXT:    v_bfi_b32 v17, s5, v20, v13
+; SI-NEXT:    v_add_f64 v[21:22], v[12:13], v[16:17]
+; SI-NEXT:    v_add_f64 v[21:22], v[21:22], -v[16:17]
+; SI-NEXT:    v_cndmask_b32_e32 v13, v22, v13, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v12, v21, v12, vcc
+; SI-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[10:11], v[18:19]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[12:13]
+; SI-NEXT:    v_add_f64 v[21:22], v[12:13], v[10:11]
+; SI-NEXT:    v_cndmask_b32_e32 v13, v13, v22, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v12, v12, v21, vcc
+; SI-NEXT:    v_ldexp_f64 v[12:13], v[12:13], 26
+; SI-NEXT:    s_sub_i32 s6, s6, 26
+; SI-NEXT:    s_cmp_gt_i32 s6, 26
+; SI-NEXT:    s_cbranch_scc1 .LBB13_13
+; SI-NEXT:  ; %bb.14: ; %Flow
+; SI-NEXT:    v_mov_b32_e32 v12, v18
+; SI-NEXT:    v_mov_b32_e32 v13, v19
+; SI-NEXT:  .LBB13_15: ; %frem.loop_exit24
+; SI-NEXT:    s_sub_i32 s0, s6, 25
+; SI-NEXT:    v_ldexp_f64 v[12:13], v[12:13], s0
+; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[14:15]
+; SI-NEXT:    s_mov_b32 s0, -1
+; SI-NEXT:    s_mov_b32 s1, 0x432fffff
+; SI-NEXT:    v_cmp_gt_f64_e64 vcc, |v[14:15]|, s[0:1]
+; SI-NEXT:    s_brev_b32 s0, -2
+; SI-NEXT:    v_mov_b32_e32 v16, 0x43300000
+; SI-NEXT:    v_bfi_b32 v17, s0, v16, v15
+; SI-NEXT:    v_mov_b32_e32 v16, 0
+; SI-NEXT:    v_add_f64 v[18:19], v[14:15], v[16:17]
+; SI-NEXT:    v_add_f64 v[16:17], v[18:19], -v[16:17]
+; SI-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v14, v16, v14, vcc
+; SI-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13]
+; SI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[12:13]
+; SI-NEXT:    v_add_f64 v[10:11], v[12:13], v[10:11]
+; SI-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; SI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], s4
+; SI-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; SI-NEXT:    v_xor_b32_e32 v11, v12, v11
+; SI-NEXT:  .LBB13_16: ; %Flow53
+; SI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[4:5]
+; SI-NEXT:    v_mov_b32_e32 v12, 0x3fc
+; SI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[4:5], v12
+; SI-NEXT:    v_mov_b32_e32 v4, 0x1f8
+; SI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v4
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v8, vcc
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[6:7]
+; SI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], v12
+; SI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[2:3], v4
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v11, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: frem_v2f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    s_mov_b32 s6, s2
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_mov_b32 s0, s8
-; CI-NEXT:    s_mov_b32 s1, s9
-; CI-NEXT:    s_mov_b32 s8, s10
-; CI-NEXT:    s_mov_b32 s9, s11
-; CI-NEXT:    s_mov_b32 s10, s2
-; CI-NEXT:    s_mov_b32 s11, s3
-; CI-NEXT:    s_mov_b32 s7, s3
-; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64
+; CI-NEXT:    s_mov_b32 s4, s10
+; CI-NEXT:    s_mov_b32 s5, s11
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
-; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; CI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
-; CI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
-; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
+; CI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB13_2
+; CI-NEXT:  ; %bb.1: ; %frem.else
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
+; CI-NEXT:    v_and_b32_e32 v8, 0x80000000, v1
+; CI-NEXT:    v_cndmask_b32_e32 v9, v1, v8, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
+; CI-NEXT:    s_cbranch_execz .LBB13_3
+; CI-NEXT:    s_branch .LBB13_8
+; CI-NEXT:  .LBB13_2:
+; CI-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; CI-NEXT:  .LBB13_3: ; %frem.compute
+; CI-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v15, v[4:5]
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v14, v[0:1]
+; CI-NEXT:    v_ldexp_f64 v[10:11], v[8:9], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[4:5]|
+; CI-NEXT:    v_add_i32_e32 v16, vcc, -1, v15
+; CI-NEXT:    v_not_b32_e32 v12, v16
+; CI-NEXT:    v_add_i32_e32 v17, vcc, v12, v14
+; CI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 1
+; CI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[8:9], v[8:9], 1.0
+; CI-NEXT:    v_rcp_f64_e32 v[18:19], v[12:13]
+; CI-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
+; CI-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
+; CI-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
+; CI-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
+; CI-NEXT:    v_div_scale_f64 v[20:21], vcc, 1.0, v[8:9], 1.0
+; CI-NEXT:    v_mul_f64 v[22:23], v[20:21], v[18:19]
+; CI-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[22:23], v[20:21]
 ; CI-NEXT:    s_nop 1
-; CI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
-; CI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
-; CI-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
-; CI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
-; CI-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
-; CI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
-; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
-; CI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
-; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
+; CI-NEXT:    v_div_fmas_f64 v[12:13], v[12:13], v[18:19], v[22:23]
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v17
+; CI-NEXT:    v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB13_7
+; CI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT:    v_sub_i32_e32 v14, vcc, v14, v15
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 26, v14
+; CI-NEXT:  .LBB13_5: ; %frem.loop_body
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v15, v11
+; CI-NEXT:    v_mov_b32_e32 v14, v10
+; CI-NEXT:    v_mul_f64 v[10:11], v[14:15], v[12:13]
+; CI-NEXT:    v_rndne_f64_e32 v[10:11], v[10:11]
+; CI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[10:11]
+; CI-NEXT:    v_add_f64 v[18:19], v[10:11], v[8:9]
+; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v19, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v10, v10, v18, vcc
+; CI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 26
+; CI-NEXT:    v_subrev_i32_e32 v17, vcc, 26, v17
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v17
+; CI-NEXT:    s_cbranch_vccnz .LBB13_5
+; CI-NEXT:  ; %bb.6: ; %Flow54
+; CI-NEXT:    v_mov_b32_e32 v10, v14
+; CI-NEXT:    v_mov_b32_e32 v11, v15
+; CI-NEXT:  .LBB13_7: ; %frem.loop_exit
+; CI-NEXT:    v_subrev_i32_e32 v14, vcc, 25, v17
+; CI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v14
+; CI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[12:13]
+; CI-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; CI-NEXT:    v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[10:11]
+; CI-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; CI-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; CI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v16
+; CI-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; CI-NEXT:    v_xor_b32_e32 v9, v10, v9
+; CI-NEXT:  .LBB13_8:
+; CI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[2:3]|, |v[6:7]|
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; CI-NEXT:    s_cbranch_vccz .LBB13_10
+; CI-NEXT:  ; %bb.9: ; %frem.else16
+; CI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
+; CI-NEXT:    v_and_b32_e32 v10, 0x80000000, v3
+; CI-NEXT:    v_cndmask_b32_e32 v11, v3, v10, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v10, v2, 0, vcc
+; CI-NEXT:    s_cbranch_execz .LBB13_11
+; CI-NEXT:    s_branch .LBB13_16
+; CI-NEXT:  .LBB13_10:
+; CI-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; CI-NEXT:  .LBB13_11: ; %frem.compute15
+; CI-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v17, v[6:7]
+; CI-NEXT:    v_frexp_exp_i32_f64_e32 v16, v[2:3]
+; CI-NEXT:    v_ldexp_f64 v[12:13], v[10:11], 26
+; CI-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[6:7]|
+; CI-NEXT:    v_add_i32_e32 v18, vcc, -1, v17
+; CI-NEXT:    v_not_b32_e32 v14, v18
+; CI-NEXT:    v_add_i32_e32 v19, vcc, v14, v16
+; CI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 1
+; CI-NEXT:    v_div_scale_f64 v[14:15], s[0:1], v[10:11], v[10:11], 1.0
+; CI-NEXT:    v_rcp_f64_e32 v[20:21], v[14:15]
+; CI-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0
+; CI-NEXT:    v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21]
+; CI-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0
+; CI-NEXT:    v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21]
+; CI-NEXT:    v_div_scale_f64 v[22:23], vcc, 1.0, v[10:11], 1.0
+; CI-NEXT:    v_mul_f64 v[24:25], v[22:23], v[20:21]
+; CI-NEXT:    v_fma_f64 v[14:15], -v[14:15], v[24:25], v[22:23]
 ; CI-NEXT:    s_nop 1
-; CI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
-; CI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
-; CI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; CI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
-; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    v_div_fmas_f64 v[14:15], v[14:15], v[20:21], v[24:25]
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v19
+; CI-NEXT:    v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
+; CI-NEXT:    s_cbranch_vccnz .LBB13_15
+; CI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT:    v_sub_i32_e32 v16, vcc, v16, v17
+; CI-NEXT:    v_add_i32_e32 v19, vcc, 26, v16
+; CI-NEXT:  .LBB13_13: ; %frem.loop_body23
+; CI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CI-NEXT:    v_mov_b32_e32 v17, v13
+; CI-NEXT:    v_mov_b32_e32 v16, v12
+; CI-NEXT:    v_mul_f64 v[12:13], v[16:17], v[14:15]
+; CI-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; CI-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[12:13]
+; CI-NEXT:    v_add_f64 v[20:21], v[12:13], v[10:11]
+; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v21, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v20, vcc
+; CI-NEXT:    v_ldexp_f64 v[12:13], v[12:13], 26
+; CI-NEXT:    v_subrev_i32_e32 v19, vcc, 26, v19
+; CI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v19
+; CI-NEXT:    s_cbranch_vccnz .LBB13_13
+; CI-NEXT:  ; %bb.14: ; %Flow
+; CI-NEXT:    v_mov_b32_e32 v12, v16
+; CI-NEXT:    v_mov_b32_e32 v13, v17
+; CI-NEXT:  .LBB13_15: ; %frem.loop_exit24
+; CI-NEXT:    v_subrev_i32_e32 v16, vcc, 25, v19
+; CI-NEXT:    v_ldexp_f64 v[12:13], v[12:13], v16
+; CI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[14:15]
+; CI-NEXT:    v_rndne_f64_e32 v[14:15], v[14:15]
+; CI-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13]
+; CI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[12:13]
+; CI-NEXT:    v_add_f64 v[10:11], v[12:13], v[10:11]
+; CI-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; CI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v18
+; CI-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; CI-NEXT:    v_xor_b32_e32 v11, v12, v11
+; CI-NEXT:  .LBB13_16: ; %Flow53
+; CI-NEXT:    v_mov_b32_e32 v12, 0x3fc
+; CI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[4:5], v12
+; CI-NEXT:    v_mov_b32_e32 v4, 0x1f8
+; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v4
+; CI-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], v12
+; CI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[2:3], v4
+; CI-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v0, 0, v8, vcc
+; CI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; CI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; CI-NEXT:    v_cndmask_b32_e32 v3, v5, v11, vcc
+; CI-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: frem_v2f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v8, s0
-; VI-NEXT:    s_add_u32 s0, s4, 64
-; VI-NEXT:    v_mov_b32_e32 v9, s1
-; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_add_u32 s0, s0, 64
+; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s11
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
-; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
-; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
-; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
-; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
-; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
-; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
-; VI-NEXT:    v_mul_f64 v[16:17], v[14:15], v[12:13]
-; VI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
+; VI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]|
+; VI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; VI-NEXT:    s_cbranch_vccz .LBB13_2
+; VI-NEXT:  ; %bb.1: ; %frem.else
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
+; VI-NEXT:    v_and_b32_e32 v8, 0x80000000, v1
+; VI-NEXT:    v_cndmask_b32_e32 v9, v1, v8, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
+; VI-NEXT:    s_cbranch_execz .LBB13_3
+; VI-NEXT:    s_branch .LBB13_8
+; VI-NEXT:  .LBB13_2:
+; VI-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; VI-NEXT:  .LBB13_3: ; %frem.compute
+; VI-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v15, v[4:5]
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v14, v[0:1]
+; VI-NEXT:    v_ldexp_f64 v[10:11], v[8:9], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[4:5]|
+; VI-NEXT:    v_add_u32_e32 v16, vcc, -1, v15
+; VI-NEXT:    v_not_b32_e32 v12, v16
+; VI-NEXT:    v_add_u32_e32 v17, vcc, v12, v14
+; VI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 1
+; VI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[8:9], v[8:9], 1.0
+; VI-NEXT:    v_rcp_f64_e32 v[18:19], v[12:13]
+; VI-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
+; VI-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
+; VI-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
+; VI-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
+; VI-NEXT:    v_div_scale_f64 v[20:21], vcc, 1.0, v[8:9], 1.0
+; VI-NEXT:    v_mul_f64 v[22:23], v[20:21], v[18:19]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[22:23], v[20:21]
 ; VI-NEXT:    s_nop 1
-; VI-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
-; VI-NEXT:    v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
-; VI-NEXT:    v_trunc_f64_e32 v[10:11], v[10:11]
-; VI-NEXT:    v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
-; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
-; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
-; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
-; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
-; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; VI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
-; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
-; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
+; VI-NEXT:    v_div_fmas_f64 v[12:13], v[12:13], v[18:19], v[22:23]
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v17
+; VI-NEXT:    v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB13_7
+; VI-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT:    v_sub_u32_e32 v14, vcc, v14, v15
+; VI-NEXT:    v_add_u32_e32 v17, vcc, 26, v14
+; VI-NEXT:  .LBB13_5: ; %frem.loop_body
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v15, v11
+; VI-NEXT:    v_mov_b32_e32 v14, v10
+; VI-NEXT:    v_mul_f64 v[10:11], v[14:15], v[12:13]
+; VI-NEXT:    v_rndne_f64_e32 v[10:11], v[10:11]
+; VI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[10:11]
+; VI-NEXT:    v_add_f64 v[18:19], v[10:11], v[8:9]
+; VI-NEXT:    v_cndmask_b32_e32 v11, v11, v19, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v10, v10, v18, vcc
+; VI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 26
+; VI-NEXT:    v_subrev_u32_e32 v17, vcc, 26, v17
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v17
+; VI-NEXT:    s_cbranch_vccnz .LBB13_5
+; VI-NEXT:  ; %bb.6: ; %Flow54
+; VI-NEXT:    v_mov_b32_e32 v10, v14
+; VI-NEXT:    v_mov_b32_e32 v11, v15
+; VI-NEXT:  .LBB13_7: ; %frem.loop_exit
+; VI-NEXT:    v_subrev_u32_e32 v14, vcc, 25, v17
+; VI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v14
+; VI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[12:13]
+; VI-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; VI-NEXT:    v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[10:11]
+; VI-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; VI-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; VI-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v16
+; VI-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; VI-NEXT:    v_xor_b32_e32 v9, v10, v9
+; VI-NEXT:  .LBB13_8:
+; VI-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[2:3]|, |v[6:7]|
+; VI-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; VI-NEXT:    s_cbranch_vccz .LBB13_10
+; VI-NEXT:  ; %bb.9: ; %frem.else16
+; VI-NEXT:    v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
+; VI-NEXT:    v_and_b32_e32 v10, 0x80000000, v3
+; VI-NEXT:    v_cndmask_b32_e32 v11, v3, v10, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v10, v2, 0, vcc
+; VI-NEXT:    s_cbranch_execz .LBB13_11
+; VI-NEXT:    s_branch .LBB13_16
+; VI-NEXT:  .LBB13_10:
+; VI-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; VI-NEXT:  .LBB13_11: ; %frem.compute15
+; VI-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v17, v[6:7]
+; VI-NEXT:    v_frexp_exp_i32_f64_e32 v16, v[2:3]
+; VI-NEXT:    v_ldexp_f64 v[12:13], v[10:11], 26
+; VI-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[6:7]|
+; VI-NEXT:    v_add_u32_e32 v18, vcc, -1, v17
+; VI-NEXT:    v_not_b32_e32 v14, v18
+; VI-NEXT:    v_add_u32_e32 v19, vcc, v14, v16
+; VI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 1
+; VI-NEXT:    v_div_scale_f64 v[14:15], s[0:1], v[10:11], v[10:11], 1.0
+; VI-NEXT:    v_rcp_f64_e32 v[20:21], v[14:15]
+; VI-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0
+; VI-NEXT:    v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21]
+; VI-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0
+; VI-NEXT:    v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21]
+; VI-NEXT:    v_div_scale_f64 v[22:23], vcc, 1.0, v[10:11], 1.0
+; VI-NEXT:    v_mul_f64 v[24:25], v[22:23], v[20:21]
+; VI-NEXT:    v_fma_f64 v[14:15], -v[14:15], v[24:25], v[22:23]
 ; VI-NEXT:    s_nop 1
-; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
-; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
-; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; VI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
-; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; VI-NEXT:    v_div_fmas_f64 v[14:15], v[14:15], v[20:21], v[24:25]
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v19
+; VI-NEXT:    v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
+; VI-NEXT:    s_cbranch_vccnz .LBB13_15
+; VI-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT:    v_sub_u32_e32 v16, vcc, v16, v17
+; VI-NEXT:    v_add_u32_e32 v19, vcc, 26, v16
+; VI-NEXT:  .LBB13_13: ; %frem.loop_body23
+; VI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; VI-NEXT:    v_mov_b32_e32 v17, v13
+; VI-NEXT:    v_mov_b32_e32 v16, v12
+; VI-NEXT:    v_mul_f64 v[12:13], v[16:17], v[14:15]
+; VI-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[12:13]
+; VI-NEXT:    v_add_f64 v[20:21], v[12:13], v[10:11]
+; VI-NEXT:    v_cndmask_b32_e32 v13, v13, v21, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v12, v12, v20, vcc
+; VI-NEXT:    v_ldexp_f64 v[12:13], v[12:13], 26
+; VI-NEXT:    v_subrev_u32_e32 v19, vcc, 26, v19
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v19
+; VI-NEXT:    s_cbranch_vccnz .LBB13_13
+; VI-NEXT:  ; %bb.14: ; %Flow
+; VI-NEXT:    v_mov_b32_e32 v12, v16
+; VI-NEXT:    v_mov_b32_e32 v13, v17
+; VI-NEXT:  .LBB13_15: ; %frem.loop_exit24
+; VI-NEXT:    v_subrev_u32_e32 v16, vcc, 25, v19
+; VI-NEXT:    v_ldexp_f64 v[12:13], v[12:13], v16
+; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[14:15]
+; VI-NEXT:    v_rndne_f64_e32 v[14:15], v[14:15]
+; VI-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13]
+; VI-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[12:13]
+; VI-NEXT:    v_add_f64 v[10:11], v[12:13], v[10:11]
+; VI-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; VI-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v18
+; VI-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; VI-NEXT:    v_xor_b32_e32 v11, v12, v11
+; VI-NEXT:  .LBB13_16: ; %Flow53
+; VI-NEXT:    v_mov_b32_e32 v12, 0x3fc
+; VI-NEXT:    v_mov_b32_e32 v13, 0x1f8
+; VI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[4:5], v12
+; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v13
+; VI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT:    v_mov_b32_e32 v14, 0x7ff80000
+; VI-NEXT:    v_mov_b32_e32 v4, s8
+; VI-NEXT:    v_mov_b32_e32 v5, s9
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], v12
+; VI-NEXT:    v_cmp_class_f64_e64 s[2:3], v[2:3], v13
+; VI-NEXT:    v_cndmask_b32_e32 v1, v14, v9, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v8, vcc
+; VI-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; VI-NEXT:    v_cndmask_b32_e32 v3, v14, v11, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: frem_v2f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v16, 0
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3]
-; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:64
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[10:11]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:64
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[2:3], v[6:7], v[6:7], v[2:3]
-; GFX9-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX9-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
-; GFX9-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
-; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
+; GFX9-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB13_2
+; GFX9-NEXT:  ; %bb.1: ; %frem.else
+; GFX9-NEXT:    v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
+; GFX9-NEXT:    v_and_b32_e32 v8, 0x80000000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v1, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB13_3
+; GFX9-NEXT:    s_branch .LBB13_8
+; GFX9-NEXT:  .LBB13_2:
+; GFX9-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX9-NEXT:  .LBB13_3: ; %frem.compute
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v15, v[4:5]
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v14, v[0:1]
+; GFX9-NEXT:    v_ldexp_f64 v[10:11], v[8:9], 26
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[4:5]|
+; GFX9-NEXT:    v_add_u32_e32 v16, -1, v15
+; GFX9-NEXT:    v_not_b32_e32 v12, v16
+; GFX9-NEXT:    v_add_u32_e32 v17, v12, v14
+; GFX9-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 1
+; GFX9-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[8:9], v[8:9], 1.0
+; GFX9-NEXT:    v_rcp_f64_e32 v[18:19], v[12:13]
+; GFX9-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
+; GFX9-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
+; GFX9-NEXT:    v_fma_f64 v[20:21], -v[12:13], v[18:19], 1.0
+; GFX9-NEXT:    v_fma_f64 v[18:19], v[18:19], v[20:21], v[18:19]
+; GFX9-NEXT:    v_div_scale_f64 v[20:21], vcc, 1.0, v[8:9], 1.0
+; GFX9-NEXT:    v_mul_f64 v[22:23], v[20:21], v[18:19]
+; GFX9-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[22:23], v[20:21]
 ; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
-; GFX9-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
-; GFX9-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
-; GFX9-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
-; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[2:3], v[4:5], v[4:5], v[0:1]
-; GFX9-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
-; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; GFX9-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
-; GFX9-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
-; GFX9-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
+; GFX9-NEXT:    v_div_fmas_f64 v[12:13], v[12:13], v[18:19], v[22:23]
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v17
+; GFX9-NEXT:    v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB13_7
+; GFX9-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v14, v14, v15
+; GFX9-NEXT:    v_add_u32_e32 v17, 26, v14
+; GFX9-NEXT:  .LBB13_5: ; %frem.loop_body
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v15, v11
+; GFX9-NEXT:    v_mov_b32_e32 v14, v10
+; GFX9-NEXT:    v_mul_f64 v[10:11], v[14:15], v[12:13]
+; GFX9-NEXT:    v_subrev_u32_e32 v17, 26, v17
+; GFX9-NEXT:    v_rndne_f64_e32 v[10:11], v[10:11]
+; GFX9-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT:    v_add_f64 v[18:19], v[10:11], v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v19, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v18, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 26
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v17
+; GFX9-NEXT:    s_cbranch_vccnz .LBB13_5
+; GFX9-NEXT:  ; %bb.6: ; %Flow54
+; GFX9-NEXT:    v_mov_b32_e32 v10, v14
+; GFX9-NEXT:    v_mov_b32_e32 v11, v15
+; GFX9-NEXT:  .LBB13_7: ; %frem.loop_exit
+; GFX9-NEXT:    v_subrev_u32_e32 v14, 25, v17
+; GFX9-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v14
+; GFX9-NEXT:    v_mul_f64 v[12:13], v[10:11], v[12:13]
+; GFX9-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; GFX9-NEXT:    v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v16
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x80000000, v1
+; GFX9-NEXT:    v_xor_b32_e32 v9, v10, v9
+; GFX9-NEXT:  .LBB13_8:
+; GFX9-NEXT:    v_cmp_ngt_f64_e64 s[0:1], |v[2:3]|, |v[6:7]|
+; GFX9-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_vccz .LBB13_10
+; GFX9-NEXT:  ; %bb.9: ; %frem.else16
+; GFX9-NEXT:    v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x80000000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v3, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v2, 0, vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB13_11
+; GFX9-NEXT:    s_branch .LBB13_16
+; GFX9-NEXT:  .LBB13_10:
+; GFX9-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; GFX9-NEXT:  .LBB13_11: ; %frem.compute15
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v17, v[6:7]
+; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v16, v[2:3]
+; GFX9-NEXT:    v_ldexp_f64 v[12:13], v[10:11], 26
+; GFX9-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[6:7]|
+; GFX9-NEXT:    v_add_u32_e32 v18, -1, v17
+; GFX9-NEXT:    v_not_b32_e32 v14, v18
+; GFX9-NEXT:    v_add_u32_e32 v19, v14, v16
+; GFX9-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 1
+; GFX9-NEXT:    v_div_scale_f64 v[14:15], s[0:1], v[10:11], v[10:11], 1.0
+; GFX9-NEXT:    v_rcp_f64_e32 v[20:21], v[14:15]
+; GFX9-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0
+; GFX9-NEXT:    v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21]
+; GFX9-NEXT:    v_fma_f64 v[22:23], -v[14:15], v[20:21], 1.0
+; GFX9-NEXT:    v_fma_f64 v[20:21], v[20:21], v[22:23], v[20:21]
+; GFX9-NEXT:    v_div_scale_f64 v[22:23], vcc, 1.0, v[10:11], 1.0
+; GFX9-NEXT:    v_mul_f64 v[24:25], v[22:23], v[20:21]
+; GFX9-NEXT:    v_fma_f64 v[14:15], -v[14:15], v[24:25], v[22:23]
 ; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
-; GFX9-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
-; GFX9-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; GFX9-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
-; GFX9-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX9-NEXT:    v_div_fmas_f64 v[14:15], v[14:15], v[20:21], v[24:25]
+; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 27, v19
+; GFX9-NEXT:    v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
+; GFX9-NEXT:    s_cbranch_vccnz .LBB13_15
+; GFX9-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX9-NEXT:    v_sub_u32_e32 v16, v16, v17
+; GFX9-NEXT:    v_add_u32_e32 v19, 26, v16
+; GFX9-NEXT:  .LBB13_13: ; %frem.loop_body23
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v17, v13
+; GFX9-NEXT:    v_mov_b32_e32 v16, v12
+; GFX9-NEXT:    v_mul_f64 v[12:13], v[16:17], v[14:15]
+; GFX9-NEXT:    v_subrev_u32_e32 v19, 26, v19
+; GFX9-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; GFX9-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[12:13]
+; GFX9-NEXT:    v_add_f64 v[20:21], v[12:13], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v13, v21, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v20, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[12:13], v[12:13], 26
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 26, v19
+; GFX9-NEXT:    s_cbranch_vccnz .LBB13_13
+; GFX9-NEXT:  ; %bb.14: ; %Flow
+; GFX9-NEXT:    v_mov_b32_e32 v12, v16
+; GFX9-NEXT:    v_mov_b32_e32 v13, v17
+; GFX9-NEXT:  .LBB13_15: ; %frem.loop_exit24
+; GFX9-NEXT:    v_subrev_u32_e32 v16, 25, v19
+; GFX9-NEXT:    v_ldexp_f64 v[12:13], v[12:13], v16
+; GFX9-NEXT:    v_mul_f64 v[14:15], v[12:13], v[14:15]
+; GFX9-NEXT:    v_rndne_f64_e32 v[14:15], v[14:15]
+; GFX9-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13]
+; GFX9-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[12:13]
+; GFX9-NEXT:    v_add_f64 v[10:11], v[12:13], v[10:11]
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v18
+; GFX9-NEXT:    v_and_b32_e32 v12, 0x80000000, v3
+; GFX9-NEXT:    v_xor_b32_e32 v11, v12, v11
+; GFX9-NEXT:  .LBB13_16: ; %Flow53
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0x3fc
+; GFX9-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[0:1], v[4:5], v12
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x1f8
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[2:3], v[0:1], v4
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7ff80000
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[0:1], v[6:7], v12
+; GFX9-NEXT:    v_cmp_class_f64_e64 s[2:3], v[2:3], v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v8, vcc
+; GFX9-NEXT:    v_cmp_neq_f64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: frem_v2f64:
@@ -4096,39 +15123,173 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v16, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3]
-; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:64
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7] offset:64
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_and_b32_e32 v16, 0x80000000, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_div_scale_f64 v[8:9], s2, v[6:7], v[6:7], v[2:3]
-; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX10-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
-; GFX10-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
-; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
-; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
-; GFX10-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
-; GFX10-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
-; GFX10-NEXT:    v_div_scale_f64 v[6:7], s2, v[4:5], v[4:5], v[0:1]
-; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
-; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
-; GFX10-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
-; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
-; GFX10-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
-; GFX10-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; GFX10-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
-; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB13_2
+; GFX10-NEXT:  ; %bb.1: ; %frem.else
+; GFX10-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v1, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB13_3
+; GFX10-NEXT:    s_branch .LBB13_8
+; GFX10-NEXT:  .LBB13_2:
+; GFX10-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX10-NEXT:  .LBB13_3: ; %frem.compute
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v13, v[4:5]
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v12, v[0:1]
+; GFX10-NEXT:    v_ldexp_f64 v[10:11], v[8:9], 26
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[4:5]|
+; GFX10-NEXT:    v_add_nc_u32_e32 v17, -1, v13
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v13
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v12
+; GFX10-NEXT:    v_not_b32_e32 v13, v17
+; GFX10-NEXT:    v_add_nc_u32_e32 v18, v13, v12
+; GFX10-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 1
+; GFX10-NEXT:    v_div_scale_f64 v[12:13], s4, v[8:9], v[8:9], 1.0
+; GFX10-NEXT:    v_rcp_f64_e32 v[14:15], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[19:20], -v[12:13], v[14:15], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[19:20], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[19:20], -v[12:13], v[14:15], 1.0
+; GFX10-NEXT:    v_fma_f64 v[14:15], v[14:15], v[19:20], v[14:15]
+; GFX10-NEXT:    v_div_scale_f64 v[19:20], vcc_lo, 1.0, v[8:9], 1.0
+; GFX10-NEXT:    v_mul_f64 v[21:22], v[19:20], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[21:22], v[19:20]
+; GFX10-NEXT:    v_div_fmas_f64 v[12:13], v[12:13], v[14:15], v[21:22]
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v18
+; GFX10-NEXT:    v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB13_7
+; GFX10-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 26
+; GFX10-NEXT:  .LBB13_5: ; %frem.loop_body
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v15, v11
+; GFX10-NEXT:    v_mov_b32_e32 v14, v10
+; GFX10-NEXT:    s_sub_i32 s2, s2, 26
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX10-NEXT:    v_mul_f64 v[10:11], v[14:15], v[12:13]
+; GFX10-NEXT:    v_rndne_f64_e32 v[10:11], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11]
+; GFX10-NEXT:    v_add_f64 v[18:19], v[10:11], v[8:9]
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v11, v19, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v10, v18, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 26
+; GFX10-NEXT:    s_cbranch_scc1 .LBB13_5
+; GFX10-NEXT:  ; %bb.6: ; %Flow54
+; GFX10-NEXT:    v_mov_b32_e32 v10, v14
+; GFX10-NEXT:    v_mov_b32_e32 v18, s2
+; GFX10-NEXT:    v_mov_b32_e32 v11, v15
+; GFX10-NEXT:  .LBB13_7: ; %frem.loop_exit
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v14, 25, v18
+; GFX10-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v14
+; GFX10-NEXT:    v_mul_f64 v[12:13], v[10:11], v[12:13]
+; GFX10-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11]
+; GFX10-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v17
+; GFX10-NEXT:    v_xor_b32_e32 v9, v16, v9
+; GFX10-NEXT:  .LBB13_8:
+; GFX10-NEXT:    v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]|
+; GFX10-NEXT:    v_and_b32_e32 v18, 0x80000000, v3
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_vccz .LBB13_10
+; GFX10-NEXT:  ; %bb.9: ; %frem.else16
+; GFX10-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v3, v18, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v2, 0, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB13_11
+; GFX10-NEXT:    s_branch .LBB13_16
+; GFX10-NEXT:  .LBB13_10:
+; GFX10-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; GFX10-NEXT:  .LBB13_11: ; %frem.compute15
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v15, v[6:7]
+; GFX10-NEXT:    v_frexp_exp_i32_f64_e32 v14, v[2:3]
+; GFX10-NEXT:    v_ldexp_f64 v[12:13], v[10:11], 26
+; GFX10-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[6:7]|
+; GFX10-NEXT:    v_add_nc_u32_e32 v19, -1, v15
+; GFX10-NEXT:    v_readfirstlane_b32 s3, v15
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v14
+; GFX10-NEXT:    v_not_b32_e32 v15, v19
+; GFX10-NEXT:    v_add_nc_u32_e32 v20, v15, v14
+; GFX10-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 1
+; GFX10-NEXT:    v_div_scale_f64 v[14:15], s4, v[10:11], v[10:11], 1.0
+; GFX10-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[21:22], -v[14:15], v[16:17], 1.0
+; GFX10-NEXT:    v_fma_f64 v[16:17], v[16:17], v[21:22], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[21:22], -v[14:15], v[16:17], 1.0
+; GFX10-NEXT:    v_fma_f64 v[16:17], v[16:17], v[21:22], v[16:17]
+; GFX10-NEXT:    v_div_scale_f64 v[21:22], vcc_lo, 1.0, v[10:11], 1.0
+; GFX10-NEXT:    v_mul_f64 v[23:24], v[21:22], v[16:17]
+; GFX10-NEXT:    v_fma_f64 v[14:15], -v[14:15], v[23:24], v[21:22]
+; GFX10-NEXT:    v_div_fmas_f64 v[14:15], v[14:15], v[16:17], v[23:24]
+; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v20
+; GFX10-NEXT:    v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
+; GFX10-NEXT:    s_cbranch_vccnz .LBB13_15
+; GFX10-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX10-NEXT:    s_sub_i32 s2, s2, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, 26
+; GFX10-NEXT:  .LBB13_13: ; %frem.loop_body23
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v17, v13
+; GFX10-NEXT:    v_mov_b32_e32 v16, v12
+; GFX10-NEXT:    s_sub_i32 s2, s2, 26
+; GFX10-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX10-NEXT:    v_mul_f64 v[12:13], v[16:17], v[14:15]
+; GFX10-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13]
+; GFX10-NEXT:    v_add_f64 v[20:21], v[12:13], v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, v13, v21, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v12, v20, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[12:13], v[12:13], 26
+; GFX10-NEXT:    s_cbranch_scc1 .LBB13_13
+; GFX10-NEXT:  ; %bb.14: ; %Flow
+; GFX10-NEXT:    v_mov_b32_e32 v12, v16
+; GFX10-NEXT:    v_mov_b32_e32 v20, s2
+; GFX10-NEXT:    v_mov_b32_e32 v13, v17
+; GFX10-NEXT:  .LBB13_15: ; %frem.loop_exit24
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v16, 25, v20
+; GFX10-NEXT:    v_ldexp_f64 v[12:13], v[12:13], v16
+; GFX10-NEXT:    v_mul_f64 v[14:15], v[12:13], v[14:15]
+; GFX10-NEXT:    v_rndne_f64_e32 v[14:15], v[14:15]
+; GFX10-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13]
+; GFX10-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13]
+; GFX10-NEXT:    v_add_f64 v[10:11], v[12:13], v[10:11]
+; GFX10-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc_lo
+; GFX10-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v19
+; GFX10-NEXT:    v_xor_b32_e32 v11, v18, v11
+; GFX10-NEXT:  .LBB13_16: ; %Flow53
+; GFX10-NEXT:    v_cmp_class_f64_e64 s2, v[4:5], 0x3fc
+; GFX10-NEXT:    v_cmp_class_f64_e64 s3, v[0:1], 0x1f8
+; GFX10-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[4:5]
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    v_cmp_class_f64_e64 s3, v[2:3], 0x1f8
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cmp_class_f64_e64 s2, v[6:7], 0x3fc
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7ff80000, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[6:7]
+; GFX10-NEXT:    s_and_b32 s2, s2, s3
+; GFX10-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7ff80000, v11, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc_lo
+; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: frem_v2f64:
@@ -4136,51 +15297,209 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v16, 0
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
-; GFX11-NEXT:    global_load_b128 v[4:7], v16, s[4:5] offset:64
+; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[4:5] offset:64
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v16, 0x80000000, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX11-NEXT:    v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB13_2
+; GFX11-NEXT:  ; %bb.1: ; %frem.else
+; GFX11-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v1, v16, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB13_3
+; GFX11-NEXT:    s_branch .LBB13_8
+; GFX11-NEXT:  .LBB13_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX11-NEXT:  .LBB13_3: ; %frem.compute
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v13, v[4:5]
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v12, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f64 v[10:11], v[8:9], 26
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[4:5]|
+; GFX11-NEXT:    v_add_nc_u32_e32 v17, -1, v13
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v13
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v13, v17
+; GFX11-NEXT:    v_add_nc_u32_e32 v18, v13, v12
+; GFX11-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_scale_f64 v[12:13], null, v[8:9], v[8:9], 1.0
+; GFX11-NEXT:    v_rcp_f64_e32 v[14:15], v[12:13]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX11-NEXT:    v_fma_f64 v[19:20], -v[12:13], v[14:15], 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX11-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
+; GFX11-NEXT:    v_fma_f64 v[14:15], v[14:15], v[19:20], v[14:15]
+; GFX11-NEXT:    v_fma_f64 v[19:20], -v[12:13], v[14:15], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[14:15], v[14:15], v[19:20], v[14:15]
+; GFX11-NEXT:    v_div_scale_f64 v[19:20], vcc_lo, 1.0, v[8:9], 1.0
+; GFX11-NEXT:    v_mul_f64 v[21:22], v[19:20], v[14:15]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
-; GFX11-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
+; GFX11-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[21:22], v[19:20]
+; GFX11-NEXT:    v_div_fmas_f64 v[12:13], v[12:13], v[14:15], v[21:22]
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v18
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB13_7
+; GFX11-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 26
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB13_5: ; %frem.loop_body
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
+; GFX11-NEXT:    s_sub_i32 s2, s2, 26
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 26
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
-; GFX11-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
+; GFX11-NEXT:    v_mul_f64 v[10:11], v[14:15], v[12:13]
+; GFX11-NEXT:    v_rndne_f64_e32 v[10:11], v[10:11]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
-; GFX11-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
-; GFX11-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
+; GFX11-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11]
+; GFX11-NEXT:    v_add_f64 v[18:19], v[10:11], v[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v11, v11, v19 :: v_dual_cndmask_b32 v10, v10, v18
+; GFX11-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 26
+; GFX11-NEXT:    s_cbranch_scc1 .LBB13_5
+; GFX11-NEXT:  ; %bb.6: ; %Flow54
+; GFX11-NEXT:    v_mov_b32_e32 v10, v14
+; GFX11-NEXT:    v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v11, v15
+; GFX11-NEXT:  .LBB13_7: ; %frem.loop_exit
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v14, 25, v18
+; GFX11-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[12:13], v[10:11], v[12:13]
+; GFX11-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11]
+; GFX11-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_cndmask_b32 v8, v10, v8
+; GFX11-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v9, v16, v9
+; GFX11-NEXT:  .LBB13_8:
+; GFX11-NEXT:    v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]|
+; GFX11-NEXT:    v_and_b32_e32 v18, 0x80000000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX11-NEXT:    s_cbranch_vccz .LBB13_10
+; GFX11-NEXT:  ; %bb.9: ; %frem.else16
+; GFX11-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v3, v18, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v10, v2, 0, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB13_11
+; GFX11-NEXT:    s_branch .LBB13_16
+; GFX11-NEXT:  .LBB13_10:
+; GFX11-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; GFX11-NEXT:  .LBB13_11: ; %frem.compute15
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v15, v[6:7]
+; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v14, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_ldexp_f64 v[12:13], v[10:11], 26
+; GFX11-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[6:7]|
+; GFX11-NEXT:    v_add_nc_u32_e32 v19, -1, v15
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v15
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_not_b32_e32 v15, v19
+; GFX11-NEXT:    v_add_nc_u32_e32 v20, v15, v14
+; GFX11-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_scale_f64 v[14:15], null, v[10:11], v[10:11], 1.0
+; GFX11-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX11-NEXT:    v_fma_f64 v[21:22], -v[14:15], v[16:17], 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; GFX11-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX11-NEXT:    v_fma_f64 v[16:17], v[16:17], v[21:22], v[16:17]
+; GFX11-NEXT:    v_fma_f64 v[21:22], -v[14:15], v[16:17], 1.0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[16:17], v[16:17], v[21:22], v[16:17]
+; GFX11-NEXT:    v_div_scale_f64 v[21:22], vcc_lo, 1.0, v[10:11], 1.0
+; GFX11-NEXT:    v_mul_f64 v[23:24], v[21:22], v[16:17]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
-; GFX11-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
+; GFX11-NEXT:    v_fma_f64 v[14:15], -v[14:15], v[23:24], v[21:22]
+; GFX11-NEXT:    v_div_fmas_f64 v[14:15], v[14:15], v[16:17], v[23:24]
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB13_15
+; GFX11-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX11-NEXT:    s_sub_i32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s2, 26
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB13_13: ; %frem.loop_body23
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12
+; GFX11-NEXT:    s_sub_i32 s2, s2, 26
+; GFX11-NEXT:    s_cmp_gt_i32 s2, 26
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
-; GFX11-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
+; GFX11-NEXT:    v_mul_f64 v[12:13], v[16:17], v[14:15]
+; GFX11-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; GFX11-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
-; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX11-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13]
+; GFX11-NEXT:    v_add_f64 v[20:21], v[12:13], v[10:11]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v13, v13, v21 :: v_dual_cndmask_b32 v12, v12, v20
+; GFX11-NEXT:    v_ldexp_f64 v[12:13], v[12:13], 26
+; GFX11-NEXT:    s_cbranch_scc1 .LBB13_13
+; GFX11-NEXT:  ; %bb.14: ; %Flow
+; GFX11-NEXT:    v_mov_b32_e32 v12, v16
+; GFX11-NEXT:    v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v13, v17
+; GFX11-NEXT:  .LBB13_15: ; %frem.loop_exit24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v16, 25, v20
+; GFX11-NEXT:    v_ldexp_f64 v[12:13], v[12:13], v16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[14:15], v[12:13], v[14:15]
+; GFX11-NEXT:    v_rndne_f64_e32 v[14:15], v[14:15]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13]
+; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13]
+; GFX11-NEXT:    v_add_f64 v[10:11], v[12:13], v[10:11]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v11, v13, v11 :: v_dual_cndmask_b32 v10, v12, v10
+; GFX11-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_xor_b32_e32 v11, v18, v11
+; GFX11-NEXT:  .LBB13_16: ; %Flow53
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[4:5], 0x3fc
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[0:1], 0x1f8
+; GFX11-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    v_cmp_class_f64_e64 s3, v[2:3], 0x1f8
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cmp_class_f64_e64 s2, v[6:7], 0x3fc
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7ff80000, v9, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[6:7]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7ff80000, v11, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc_lo
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f64:
@@ -4188,50 +15507,207 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v16, 0
+; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
-; GFX1150-NEXT:    global_load_b128 v[4:7], v16, s[4:5] offset:64
+; GFX1150-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
+; GFX1150-NEXT:    global_load_b128 v[4:7], v4, s[4:5] offset:64
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_and_b32_e32 v16, 0x80000000, v1
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
-; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX1150-NEXT:    v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX1150-NEXT:    s_cbranch_vccz .LBB13_2
+; GFX1150-NEXT:  ; %bb.1: ; %frem.else
+; GFX1150-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
+; GFX1150-NEXT:    v_cndmask_b32_e32 v9, v1, v16, vcc_lo
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB13_3
+; GFX1150-NEXT:    s_branch .LBB13_8
+; GFX1150-NEXT:  .LBB13_2:
+; GFX1150-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX1150-NEXT:  .LBB13_3: ; %frem.compute
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v13, v[4:5]
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v12, v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_ldexp_f64 v[10:11], v[8:9], 26
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[8:9], |v[4:5]|
+; GFX1150-NEXT:    v_add_nc_u32_e32 v17, -1, v13
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v13
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v13, v17
+; GFX1150-NEXT:    v_add_nc_u32_e32 v18, v13, v12
+; GFX1150-NEXT:    v_ldexp_f64 v[8:9], v[8:9], 1
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
-; GFX1150-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
-; GFX1150-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
+; GFX1150-NEXT:    v_div_scale_f64 v[12:13], null, v[8:9], v[8:9], 1.0
+; GFX1150-NEXT:    v_rcp_f64_e32 v[14:15], v[12:13]
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[19:20], -v[12:13], v[14:15], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[14:15], v[14:15], v[19:20], v[14:15]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
-; GFX1150-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
+; GFX1150-NEXT:    v_fma_f64 v[19:20], -v[12:13], v[14:15], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[14:15], v[14:15], v[19:20], v[14:15]
+; GFX1150-NEXT:    v_div_scale_f64 v[19:20], vcc_lo, 1.0, v[8:9], 1.0
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
-; GFX1150-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
-; GFX1150-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
-; GFX1150-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
+; GFX1150-NEXT:    v_mul_f64 v[21:22], v[19:20], v[14:15]
+; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[21:22], v[19:20]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f64 v[12:13], v[12:13], v[14:15], v[21:22]
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v18
+; GFX1150-NEXT:    v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB13_7
+; GFX1150-NEXT:  ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s2, s3
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 26
+; GFX1150-NEXT:    .p2align 6
+; GFX1150-NEXT:  .LBB13_5: ; %frem.loop_body
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
+; GFX1150-NEXT:    s_sub_i32 s2, s2, 26
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[10:11], v[14:15], v[12:13]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[10:11], v[10:11]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[8:9], v[14:15]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11]
+; GFX1150-NEXT:    v_add_f64 v[18:19], v[10:11], v[8:9]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_dual_cndmask_b32 v11, v11, v19 :: v_dual_cndmask_b32 v10, v10, v18
+; GFX1150-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 26
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB13_5
+; GFX1150-NEXT:  ; %bb.6: ; %Flow54
+; GFX1150-NEXT:    v_mov_b32_e32 v10, v14
+; GFX1150-NEXT:    v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v11, v15
+; GFX1150-NEXT:  .LBB13_7: ; %frem.loop_exit
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_subrev_nc_u32_e32 v14, 25, v18
+; GFX1150-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v14
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[12:13], v[10:11], v[12:13]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[12:13], v[8:9], v[10:11]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[10:11]
+; GFX1150-NEXT:    v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_dual_cndmask_b32 v9, v11, v9 :: v_dual_cndmask_b32 v8, v10, v8
+; GFX1150-NEXT:    v_ldexp_f64 v[8:9], v[8:9], v17
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v9, v16, v9
+; GFX1150-NEXT:  .LBB13_8:
+; GFX1150-NEXT:    v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]|
+; GFX1150-NEXT:    v_and_b32_e32 v18, 0x80000000, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1150-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
+; GFX1150-NEXT:    s_cbranch_vccz .LBB13_10
+; GFX1150-NEXT:  ; %bb.9: ; %frem.else16
+; GFX1150-NEXT:    v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_cndmask_b32_e32 v11, v3, v18, vcc_lo
+; GFX1150-NEXT:    v_cndmask_b32_e64 v10, v2, 0, vcc_lo
+; GFX1150-NEXT:    s_cbranch_execz .LBB13_11
+; GFX1150-NEXT:    s_branch .LBB13_16
+; GFX1150-NEXT:  .LBB13_10:
+; GFX1150-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; GFX1150-NEXT:  .LBB13_11: ; %frem.compute15
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v15, v[6:7]
+; GFX1150-NEXT:    v_frexp_exp_i32_f64_e32 v14, v[2:3]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1150-NEXT:    v_ldexp_f64 v[12:13], v[10:11], 26
+; GFX1150-NEXT:    v_frexp_mant_f64_e64 v[10:11], |v[6:7]|
+; GFX1150-NEXT:    v_add_nc_u32_e32 v19, -1, v15
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v15
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v14
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_not_b32_e32 v15, v19
+; GFX1150-NEXT:    v_add_nc_u32_e32 v20, v15, v14
+; GFX1150-NEXT:    v_ldexp_f64 v[10:11], v[10:11], 1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_scale_f64 v[14:15], null, v[10:11], v[10:11], 1.0
+; GFX1150-NEXT:    v_rcp_f64_e32 v[16:17], v[14:15]
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; GFX1150-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX1150-NEXT:    v_fma_f64 v[21:22], -v[14:15], v[16:17], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[16:17], v[16:17], v[21:22], v[16:17]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[21:22], -v[14:15], v[16:17], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[16:17], v[16:17], v[21:22], v[16:17]
+; GFX1150-NEXT:    v_div_scale_f64 v[21:22], vcc_lo, 1.0, v[10:11], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[23:24], v[21:22], v[16:17]
+; GFX1150-NEXT:    v_fma_f64 v[14:15], -v[14:15], v[23:24], v[21:22]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_div_fmas_f64 v[14:15], v[14:15], v[16:17], v[23:24]
+; GFX1150-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 27, v20
+; GFX1150-NEXT:    v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
+; GFX1150-NEXT:    s_cbranch_vccnz .LBB13_15
+; GFX1150-NEXT:  ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1150-NEXT:    s_sub_i32 s2, s2, s3
+; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_add_i32 s2, s2, 26
+; GFX1150-NEXT:    .p2align 6
+; GFX1150-NEXT:  .LBB13_13: ; %frem.loop_body23
+; GFX1150-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12
+; GFX1150-NEXT:    s_sub_i32 s2, s2, 26
+; GFX1150-NEXT:    s_cmp_gt_i32 s2, 26
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[12:13], v[16:17], v[14:15]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[12:13], v[12:13]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
-; GFX1150-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
-; GFX1150-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[12:13], v[10:11], v[16:17]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13]
+; GFX1150-NEXT:    v_add_f64 v[20:21], v[12:13], v[10:11]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
-; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
+; GFX1150-NEXT:    v_dual_cndmask_b32 v13, v13, v21 :: v_dual_cndmask_b32 v12, v12, v20
+; GFX1150-NEXT:    v_ldexp_f64 v[12:13], v[12:13], 26
+; GFX1150-NEXT:    s_cbranch_scc1 .LBB13_13
+; GFX1150-NEXT:  ; %bb.14: ; %Flow
+; GFX1150-NEXT:    v_mov_b32_e32 v12, v16
+; GFX1150-NEXT:    v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v13, v17
+; GFX1150-NEXT:  .LBB13_15: ; %frem.loop_exit24
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
-; GFX1150-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
+; GFX1150-NEXT:    v_subrev_nc_u32_e32 v16, 25, v20
+; GFX1150-NEXT:    v_ldexp_f64 v[12:13], v[12:13], v16
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
-; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
-; GFX1150-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX1150-NEXT:    v_mul_f64 v[14:15], v[12:13], v[14:15]
+; GFX1150-NEXT:    v_rndne_f64_e32 v[14:15], v[14:15]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[14:15], v[10:11], v[12:13]
+; GFX1150-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 0, v[12:13]
+; GFX1150-NEXT:    v_add_f64 v[10:11], v[12:13], v[10:11]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_dual_cndmask_b32 v11, v13, v11 :: v_dual_cndmask_b32 v10, v12, v10
+; GFX1150-NEXT:    v_ldexp_f64 v[10:11], v[10:11], v19
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v11, v18, v11
+; GFX1150-NEXT:  .LBB13_16: ; %Flow53
+; GFX1150-NEXT:    v_cmp_class_f64_e64 s2, v[4:5], 0x3fc
+; GFX1150-NEXT:    v_cmp_class_f64_e64 s3, v[0:1], 0x1f8
+; GFX1150-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[4:5]
+; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s3
+; GFX1150-NEXT:    v_cmp_class_f64_e64 s3, v[2:3], 0x1f8
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX1150-NEXT:    v_cmp_class_f64_e64 s2, v[6:7], 0x3fc
+; GFX1150-NEXT:    v_cndmask_b32_e32 v1, 0x7ff80000, v9, vcc_lo
+; GFX1150-NEXT:    v_cndmask_b32_e32 v0, 0, v8, vcc_lo
+; GFX1150-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 0, v[6:7]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1150-NEXT:    s_and_b32 s2, s2, s3
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
+; GFX1150-NEXT:    v_cndmask_b32_e32 v3, 0x7ff80000, v11, vcc_lo
+; GFX1150-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc_lo
+; GFX1150-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index d72f517cfb603..3b80350d85289 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -16,9 +16,9 @@
 ; CHECK-NEXT: Target Pass Configuration
 ; CHECK-NEXT: Machine Module Information
 ; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
-; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Machine Branch Probability Analysis

>From 05e90cdaf9f7f989a33b1fb3acf0d14f37ba130d Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 19 Mar 2025 12:19:14 -0400
Subject: [PATCH 15/19] Adjust freeze.ll test

---
 llvm/test/CodeGen/AMDGPU/freeze.ll | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 96725e6996e3d..4e33a7867b3e1 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -6315,6 +6315,10 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
 ; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GFX6-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-SDAG-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX6-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
@@ -6344,6 +6348,10 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
 ; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GFX7-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-SDAG-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)

>From 485f446a1a0e97205df0ee854115ace9b059c1e6 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 20 Mar 2025 06:15:06 -0400
Subject: [PATCH 16/19] Remove special-case handling that deferred to DAGCombiner

---
 llvm/lib/CodeGen/ExpandFp.cpp                 | 30 ++-----------------
 llvm/test/CodeGen/AMDGPU/dagcombine-select.ll | 11 +------
 2 files changed, 3 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index fb8e333ab7a56..549de9dc64e9d 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -107,6 +107,8 @@ class FRemExpander {
         One(ConstantInt::get(ExTy, 1)), Signbit(Signbit) {};
 
   Value *createRcp(Value *V, const Twine &Name) const {
+    // Leave it to later optimizations to turn this into an rcp
+    // instruction if available.
     return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name);
   }
 
@@ -341,36 +343,8 @@ Value *FRemExpander::buildFRem(Value *X, Value *Y,
 }
 } // namespace
 
-/// Return true if \p Op either is a constant or a selection
-/// instruction with constant operands.
-static bool isConstOrConstSelectOp(Value *Op) {
-  if (isa<Constant>(Op))
-    return true;
-
-  auto *S = dyn_cast<SelectInst>(Op);
-  if (!S)
-    return false;
-
-  return isa<Constant>(S->getTrueValue()) && isa<Constant>(S->getFalseValue());
-}
-
-/// Returns true if \p I should not be expanded because
-/// it will be eliminated during ISel.
-static bool shouldSkipExpandFRem(BinaryOperator &I) {
-  // This condition should be sufficient for DAGCombiner::visitFREM to
-  // eliminate the instruction.
-  return isConstOrConstSelectOp(I.getOperand(0)) &&
-         isConstOrConstSelectOp(I.getOperand(1));
-}
-
 static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
   LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
-  if (shouldSkipExpandFRem(I)) {
-    LLVM_DEBUG(
-        dbgs() << "Skipping 'frem' instruction that should be removed by "
-                  "DAGCombiner.\n");
-    return false;
-  }
 
   Type *ReturnTy = I.getType();
   assert(ReturnTy->isFPOrFPVectorTy());
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
index f4d8ec180cf91..e7621a1e6a949 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -297,13 +297,4 @@ define amdgpu_kernel void @fdiv_constant_sel_constants(ptr addrspace(1) %p, i1 %
   %bo = fdiv float 8.0, %sel
   store float %bo, ptr addrspace(1) %p, align 4
   ret void
-}
-
-; GCN-LABEL: {{^}}frem_constant_sel_constants:
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0,
-define amdgpu_kernel void @frem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
-  %sel = select i1 %cond, float -4.0, float 3.0
-  %bo = frem float 5.0, %sel
-  store float %bo, ptr addrspace(1) %p, align 4
-  ret void
-}
+}
\ No newline at end of file

>From 8559b6d7905816569324935a7e81ed5ae049cc15 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 20 Mar 2025 06:15:49 -0400
Subject: [PATCH 17/19] Adjust test

---
 llvm/test/CodeGen/LoongArch/opt-pipeline.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index 90d994909264a..661f67d4989c4 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -20,9 +20,9 @@
 ; LAXX-NEXT: Target Pass Configuration
 ; LAXX-NEXT: Machine Module Information
 ; LAXX-NEXT: Target Transform Information
+; LAXX-NEXT: Assumption Cache Tracker
 ; LAXX-NEXT: Type-Based Alias Analysis
 ; LAXX-NEXT: Scoped NoAlias Alias Analysis
-; LAXX-NEXT: Assumption Cache Tracker
 ; LAXX-NEXT: Profile summary info
 ; LAXX-NEXT: Create Garbage Collector Module Metadata
 ; LAXX-NEXT: Machine Branch Probability Analysis

>From 772f855932e5b0b5dbbd396acd7a5c5f90857769 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 20 Mar 2025 06:17:42 -0400
Subject: [PATCH 18/19] clang-format changes

---
 llvm/lib/CodeGen/ExpandFp.cpp                 | 11 ++++++-----
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  3 ++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 549de9dc64e9d..ec1c33a0ecf85 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -27,8 +27,8 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/IR/PassManager.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -255,9 +255,9 @@ class FRemExpander {
 
   /// Return a value that is NaN if one of the corner cases concerning
   /// the inputs \p X and \p Y is detected, and \p Ret otherwise.
-  Value *handleInputCornerCases(Value *Ret, Value *X,
-                                Value *Y, std::optional<SimplifyQuery> &SQ,
-				bool NoInfs) const {
+  Value *handleInputCornerCases(Value *Ret, Value *X, Value *Y,
+                                std::optional<SimplifyQuery> &SQ,
+                                bool NoInfs) const {
     // Build:
     //   ret = y == 0.0f ? QNAN_ComputeFpTy : ret;
     //   bool c = !BUILTIN_ISNAN_ComputeFpTy(y) &&
@@ -1075,7 +1075,8 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
 
 namespace {
 class ExpandFpLegacyPass : public FunctionPass {
-    CodeGenOptLevel OptLevel;
+  CodeGenOptLevel OptLevel;
+
 public:
   static char ID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d1250e8bd3a84..796f0103b900a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1423,7 +1423,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
-  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
+  case ISD::SDIVREM:
+    return LowerSDIVREM(Op, DAG);
   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
   case ISD::FRINT: return LowerFRINT(Op, DAG);

>From 111218942d4a3ac0469a9306d474c59433bf563f Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 20 Mar 2025 12:21:16 -0400
Subject: [PATCH 19/19] Adjust type

---
 llvm/lib/CodeGen/ExpandFp.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index ec1c33a0ecf85..9b446e61b336f 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -85,9 +85,9 @@ class FRemExpander {
     if (Ty->is16bitFPTy())
       return FRemExpander{B, Ty, 11, 0x8000, B.getFloatTy(), B.getInt16Ty()};
     if (Ty->isFloatTy() || Ty->isHalfTy())
-      return FRemExpander{B, Ty, 12, 0x80000000L, Ty, B.getInt32Ty()};
+      return FRemExpander{B, Ty, 12, 0x80000000UL, Ty, B.getInt32Ty()};
     if (Ty->isDoubleTy())
-      return FRemExpander{B, Ty, 26, 0x8000000000000000L, Ty, B.getInt64Ty()};
+      return FRemExpander{B, Ty, 26, 0x8000000000000000UL, Ty, B.getInt64Ty()};
 
     return std::nullopt;
   }



More information about the llvm-commits mailing list