[llvm] [AMDGPU] Implement IR expansion for frem instruction (PR #130988)
Frederik Harwath via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 13 02:54:02 PDT 2025
https://github.com/frederik-h updated https://github.com/llvm/llvm-project/pull/130988
>From b0f799670bceca75088b5b8457021dd7c1c11b85 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Tue, 25 Feb 2025 06:24:40 -0500
Subject: [PATCH 1/4] Implement IR expansion for frem instruction
This patch implements a correctly rounded expansion of the frem
instruction in LLVM IR. This is useful for target architectures where
such an expansion is too involved to be implemented in ISel
Lowering. The expansion is based on the code from the AMD device libs
and has been tested successfully against the OpenCL conformance tests
on AMDGPU. The expansion is implemented in the preexisting
"expand-large-fp-convert" pass. It is enabled by a new
"shouldExpandFRemInIR" function in TargetLowering.
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 +
llvm/lib/CodeGen/ExpandLargeFpConvert.cpp | 389 +-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 +
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 4547 ++++++++++++++++---
llvm/test/CodeGen/AMDGPU/wave32.ll | 384 +-
5 files changed, 4658 insertions(+), 668 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2089d47e9cbc8..b64c57fdba992 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5671,6 +5671,10 @@ class TargetLowering : public TargetLoweringBase {
LoadSDNode *OriginalLoad,
SelectionDAG &DAG) const;
+ /// Indicates whether the FRem instruction should be expanded before
+ /// ISel in the LLVM IR.
+ virtual bool shouldExpandFRemInIR() const { return false; };
+
private:
SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
const SDLoc &DL, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index ee583a25214ef..31d3779eb7c9f 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -6,11 +6,12 @@
//
//===----------------------------------------------------------------------===//
//
-
// This pass expands ‘fptoui .. to’, ‘fptosi .. to’, ‘uitofp .. to’,
// ‘sitofp .. to’ instructions with a bitwidth above a threshold into
// auto-generated functions. This is useful for targets like x86_64 that cannot
// lower fp convertions with more than 128 bits.
+// Furthermore, the pass can expand FRem instructions if requested in the
+// TargetLowering for the current target.
//
//===----------------------------------------------------------------------===//
@@ -21,6 +22,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/FMF.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/PassManager.h"
@@ -28,6 +30,9 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "expand-large-fp-convert"
using namespace llvm;
@@ -37,6 +42,376 @@ static cl::opt<unsigned>
cl::desc("fp convert instructions on integers with "
"more than <N> bits are expanded."));
+namespace {
+/// This class implements a precise expansion of the frem instruction.
+/// The generated code is based on the fmod implementation in the AMD device
+/// libs.
+class FRemExpander {
+ /// The IRBuilder to use for the expansion.
+ IRBuilder<> &B;
+
+ /// Floating point type of the return value and the arguments of the FRem
+ /// instructions that should be expanded.
+ Type *FremTy;
+
+ /// Floating point type to use for the computation. This may be
+ /// wider than the \p FremTy.
+ Type *ComputeFpTy;
+
+ /// Integer type that can hold floating point values of type \p FremTy.
+ Type *IntTy;
+
+ /// Integer type used to hold the exponents returned by frexp.
+ Type *ExTy;
+
+ /// How many bits of the quotient to compute per iteration of the
+ /// algorithm, stored as a value of type \p ExTy.
+ Value *Bits;
+
+ /// Constant 1 of type \p ExTy.
+ Value *One;
+
+ /// The sign bit for floating point values of type \p FremTy.
+ const unsigned long Signbit;
+
+public:
+ static std::optional<FRemExpander> create(IRBuilder<> &B, Type *Ty) {
+ if (Ty->is16bitFPTy())
+ return FRemExpander{B, Ty, 11, 0x8000, B.getFloatTy(), B.getInt16Ty()};
+ if (Ty->isFloatTy() || Ty->isHalfTy())
+ return FRemExpander{B, Ty, 12, 0x80000000L, Ty, B.getInt32Ty()};
+ if (Ty->isDoubleTy())
+ return FRemExpander{B, Ty, 26, 0x8000000000000000L, Ty, B.getInt64Ty()};
+
+ return std::nullopt;
+ }
+
+ /// Build the FRem expansion for the numerator \p X and the
+ /// denominator \p Y using the builder \p B. The type of X and Y
+ /// must match the type for which the class instance has been
+ /// created. The code will be generated at the insertion point of \p
+ /// B and the insertion point will be reset at exit.
+ Value *buildFRem(Value *X, Value *Y) const;
+
+private:
+ FRemExpander(IRBuilder<> &B, Type *FremTy, short Bits, unsigned long Signbit,
+ Type *ComputeFpTy, Type *IntTy)
+ : B(B), FremTy(FremTy), ComputeFpTy(ComputeFpTy), IntTy(IntTy),
+ ExTy(B.getInt32Ty()), Bits(ConstantInt::get(ExTy, Bits)),
+ One(ConstantInt::get(ExTy, 1)), Signbit(Signbit) {};
+
+ Value *createLdexp(Value *Base, Value *Exp, const Twine &Name) const {
+ return B.CreateIntrinsic(Intrinsic::ldexp, {ComputeFpTy, B.getInt32Ty()},
+ {Base, Exp}, {}, Name);
+ }
+
+ Value *createRcp(Value *V, const Twine &Name) const {
+ return B.CreateFDiv(ConstantFP::get(ComputeFpTy, 1.0), V, Name);
+ }
+
+ // Helper function to build the UPDATE_AX code which is common to the
+ // loop body and the "final iteration".
+ Value *buildUpdateAx(Value *Ax, Value *Ay, Value *Ayinv) const {
+ // Build:
+ // float q = BUILTIN_RINT_ComputeFpTy(ax * ayinv);
+ // ax = fnma(q, ay, ax);
+ // int clt = ax < 0.0f;
+ // float axp = ax + ay;
+ // ax = clt ? axp : ax;
+ Value *Q = B.CreateUnaryIntrinsic(Intrinsic::rint, B.CreateFMul(Ax, Ayinv),
+ {}, "q");
+ Value *AxUpdate = B.CreateIntrinsic(Intrinsic::fma, {ComputeFpTy},
+ {B.CreateFNeg(Q), Ay, Ax}, {}, "ax");
+ Value *Clt = B.CreateFCmp(CmpInst::FCMP_OLT, AxUpdate,
+ ConstantFP::get(ComputeFpTy, 0.0), "clt");
+ Value *Axp = B.CreateFAdd(AxUpdate, Ay, "axp");
+ AxUpdate = B.CreateSelect(Clt, Axp, AxUpdate, "ax");
+
+ return AxUpdate;
+ }
+
+ /// Build code to extract the exponent and mantissa of \p Src.
+ /// Return the exponent minus one for use as a loop bound and
+ /// the mantissa taken to the given \p NewExp power.
+ std::pair<Value *, Value *> buildExpAndPower(Value *Src, Value *NewExp,
+ const Twine &ExName,
+ const Twine &PowName) const {
+ // Build:
+ // ExName = BUILTIN_FREXP_EXP_ComputeFpTy(Src) - 1;
+ // PowName =
+ // BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ExName),
+ // NewExp);
+ Type *Ty = Src->getType();
+ Type *ExTy = B.getInt32Ty();
+ Value *Frexp = B.CreateIntrinsic(Intrinsic::frexp, {Ty, ExTy}, Src);
+ Value *Mant = B.CreateExtractValue(Frexp, {0});
+ Value *Exp = B.CreateExtractValue(Frexp, {1});
+
+ Exp = B.CreateSub(Exp, One, ExName);
+ Value *Pow = createLdexp(Mant, NewExp, PowName);
+
+ return {Pow, Exp};
+ }
+
+ /// Build the main computation of the remainder for the case in which
+ /// Ax > Ay, where Ax = |X|, Ay = |Y|, and X is the numerator and Y the
+ /// denumerator. Add the incoming edge from the computation result
+ /// to \p RetPhi.
+ void buildRemainderComputation(Value *AxInitial, Value *AyInitial, Value *X,
+ PHINode *RetPhi) const {
+ // Build:
+ // ex = BUILTIN_FREXP_EXP_ComputeFpTy(ax) - 1;
+ // ax = BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ax),
+ // bits); ey = BUILTIN_FREXP_EXP_ComputeFpTy(ay) - 1; ay =
+ // BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ay), 1); auto
+ // [Ax, Ex]{getFrexpResults(B, AxInitial)};
+ auto [Ax, Ex] = buildExpAndPower(AxInitial, Bits, "ex", "ax");
+ auto [Ay, Ey] = buildExpAndPower(AyInitial, One, "ey", "ay");
+
+ // Build:
+ // int nb = ex - ey;
+ // float ayinv = MATH_FAST_RCP(ay);
+ Value *Nb = B.CreateSub(Ex, Ey, "nb");
+ Value *Ayinv = createRcp(Ay, "ayinv");
+
+ // Build: while (nb > bits)
+ BasicBlock *PreheaderBB = B.GetInsertBlock();
+ Function *Fun = PreheaderBB->getParent();
+ auto *LoopBB = BasicBlock::Create(B.getContext(), "frem.loop_body", Fun);
+ auto *ExitBB = BasicBlock::Create(B.getContext(), "frem.loop_exit", Fun);
+
+ B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, Nb, Bits), LoopBB, ExitBB);
+
+ // Build loop body:
+ // UPDATE_AX
+ // ax = BUILTIN_FLDEXP_ComputeFpTy(ax, bits);
+ // nb -= bits;
+ // One iteration of the loop is factored out. The code shared by
+ // the loop and this "iteration" is denoted by UPDATE_AX.
+ B.SetInsertPoint(LoopBB);
+ auto *NbIv = B.CreatePHI(Nb->getType(), 2, "nb_iv");
+ NbIv->addIncoming(Nb, PreheaderBB);
+
+ auto *AxPhi = B.CreatePHI(ComputeFpTy, 2, "ax_loop_phi");
+ AxPhi->addIncoming(Ax, PreheaderBB);
+
+ Value *AxPhiUpdate = buildUpdateAx(AxPhi, Ay, Ayinv);
+ AxPhiUpdate = createLdexp(AxPhiUpdate, Bits, "ax_update");
+ AxPhi->addIncoming(AxPhiUpdate, LoopBB);
+ NbIv->addIncoming(B.CreateSub(NbIv, Bits, "nb_update"), LoopBB);
+
+ B.CreateCondBr(B.CreateICmp(CmpInst::ICMP_SGT, NbIv, Bits), LoopBB, ExitBB);
+
+ // Build final iteration
+ // ax = BUILTIN_FLDEXP_ComputeFpTy(ax, nb - bits + 1);
+ // UPDATE_AX
+ B.SetInsertPoint(ExitBB);
+
+ auto *AxPhiExit = B.CreatePHI(ComputeFpTy, 2, "ax_exit_phi");
+ AxPhiExit->addIncoming(Ax, PreheaderBB);
+ AxPhiExit->addIncoming(AxPhi, LoopBB);
+ auto *NbExitPhi = B.CreatePHI(Nb->getType(), 2, "nb_exit_phi");
+ NbExitPhi->addIncoming(NbIv, LoopBB);
+ NbExitPhi->addIncoming(Nb, PreheaderBB);
+
+ Value *AxFinal = createLdexp(
+ AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), "ax");
+ AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv);
+
+ // Adjust exponent and sign
+ // ax = BUILTIN_FLDEXP_ComputeFpTy(ax, ey);
+ // ret = AS_FLOAT((AS_INT(x) & SIGNBIT_SP32) ^ AS_INT(ax));
+ AxFinal = createLdexp(AxFinal, Ey, "ax");
+
+ Value *XAsInt = B.CreateBitCast(X, IntTy, "x_as_int");
+ if (ComputeFpTy != X->getType())
+ AxFinal = B.CreateFPTrunc(AxFinal, X->getType());
+
+ Value *AxAsInt = B.CreateBitCast(AxFinal, IntTy, "ax_as_int");
+
+ Value *Ret =
+ B.CreateXor(B.CreateAnd(XAsInt, Signbit), AxAsInt, "Remainder");
+ Ret = B.CreateBitCast(Ret, X->getType());
+
+ RetPhi->addIncoming(Ret, ExitBB);
+ }
+
+ /// Build the else-branch of the conditional in the FRem
+ /// expansion, i.e. the case in which Ax <= Ay, where Ax = |X|, Ay
+ /// = |Y|, and X is the numerator and Y the denominator. Add the
+ /// incoming edge from the result to \p RetPhi.
+ void buildElseBranch(Value *Ax, Value *Ay, Value *X, PHINode *RetPhi) const {
+ // Build:
+ // ret = ax == ay ? BUILTIN_COPYSIGN_ComputeFpTy(0.0f, x) : x;
+ Value *ZeroWithXSign = B.CreateIntrinsic(
+ Intrinsic::copysign, {FremTy}, {ConstantFP::get(FremTy, 0.0), X}, {});
+
+ Value *Ret = B.CreateSelect(B.CreateFCmpOEQ(Ax, Ay), ZeroWithXSign, X);
+
+ RetPhi->addIncoming(Ret, B.GetInsertBlock());
+ }
+
+ /// Adjust the result of the main computation from the FRem expansion
+ /// if NaNs or infinite values are possible.
+ Value *buildNanAndInfHandling(Value *Ret, Value *X, Value *Y) const {
+ // Build:
+ // ret = y == 0.0f ? QNAN_ComputeFpTy : ret;
+ // bool c = !BUILTIN_ISNAN_ComputeFpTy(y) &&
+ // BUILTIN_ISFINITE_ComputeFpTy(x); ret = c ? ret : QNAN_ComputeFpTy;
+ // TODO Handle NaN and infinity fast math flags separately here?
+ Value *Nan = ConstantFP::getQNaN(FremTy);
+
+ Ret = B.CreateSelect(B.createIsFPClass(Y, FPClassTest::fcZero), Nan, Ret);
+ Value *C = B.CreateLogicalAnd(
+ B.CreateNot(B.createIsFPClass(Y, FPClassTest::fcNan)),
+ B.createIsFPClass(X, FPClassTest::fcFinite));
+ Ret = B.CreateSelect(C, Ret, Nan);
+
+ return Ret;
+ }
+};
+
+Value *FRemExpander::buildFRem(Value *X, Value *Y) const {
+ assert(X->getType() == FremTy && Y->getType() == FremTy);
+
+ FastMathFlags FMF = B.getFastMathFlags();
+
+ // This function generates the following code structure:
+ // if (abs(x) > abs(y))
+ // { ret = compute remainder }
+ // else
+ // { ret = x or 0 with sign of x }
+ // Adjust ret to NaN/inf in input
+ // return ret
+ Value *Ax = B.CreateUnaryIntrinsic(Intrinsic::fabs, X, {}, "ax");
+ Value *Ay = B.CreateUnaryIntrinsic(Intrinsic::fabs, Y, {}, "ay");
+ if (ComputeFpTy != X->getType()) {
+ Ax = B.CreateFPExt(Ax, ComputeFpTy, "ax");
+ Ay = B.CreateFPExt(Ay, ComputeFpTy, "ay");
+ }
+ Value *AxAyCmp = B.CreateFCmpOGT(Ax, Ay);
+
+ PHINode *RetPhi = B.CreatePHI(FremTy, 2, "ret");
+ Value *Ret = RetPhi;
+
+ if (!FMF.noNaNs() || !FMF.noInfs())
+ Ret = buildNanAndInfHandling(Ret, X, Y);
+
+ Function *Fun = B.GetInsertBlock()->getParent();
+ auto *ThenBB = BasicBlock::Create(B.getContext(), "frem.compute", Fun);
+ auto *ElseBB = BasicBlock::Create(B.getContext(), "frem.else", Fun);
+ SplitBlockAndInsertIfThenElse(AxAyCmp, RetPhi, &ThenBB, &ElseBB);
+
+ auto SavedInsertPt = B.GetInsertPoint();
+
+ // Build remainder computation for "then" branch
+ //
+ // The ordered comparison ensures that ax and ay are not NaNs
+ // in the then-branch. Furthermore, y cannot be an infinity and the
+ // check at the end of the function ensures that the result will not
+ // be used if x is an infinity.
+ FastMathFlags ComputeFMF = FMF;
+ ComputeFMF.setNoInfs();
+ ComputeFMF.setNoNaNs();
+
+ B.SetInsertPoint(ThenBB);
+ B.setFastMathFlags(ComputeFMF);
+ buildRemainderComputation(Ax, Ay, X, RetPhi);
+ B.setFastMathFlags(FMF);
+ B.CreateBr(RetPhi->getParent());
+
+ // Build "else"-branch
+ B.SetInsertPoint(ElseBB);
+ buildElseBranch(Ax, Ay, X, RetPhi);
+ B.CreateBr(RetPhi->getParent());
+
+ B.SetInsertPoint(SavedInsertPt);
+
+ return Ret;
+}
+} // namespace
+
+/// Return true if \p Op either is a constant or a selection
+/// instruction with constant operands.
+static bool isConstOrConstSelectOp(Value *Op) {
+ if (isa<Constant>(Op))
+ return true;
+
+ auto *S = dyn_cast<SelectInst>(Op);
+ if (!S)
+ return false;
+
+ return isa<Constant>(S->getTrueValue()) && isa<Constant>(S->getFalseValue());
+}
+
+/// Returns true if \p I should not be expanded because
+/// it will be eliminated during ISel.
+static bool shouldSkipExpandFRem(BinaryOperator &I) {
+ // This condition should be sufficient for DAGCombiner::visitFREM to
+ // eliminate the instruction.
+ return isConstOrConstSelectOp(I.getOperand(0)) &&
+ isConstOrConstSelectOp(I.getOperand(1));
+}
+
+static bool expandFRem(BinaryOperator &I) {
+ LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
+ if (shouldSkipExpandFRem(I)) {
+ LLVM_DEBUG(
+ dbgs() << "Skipping 'frem' instruction that should be removed by "
+ "DAGCombiner.\n");
+ return false;
+ }
+
+ Type *ReturnTy = I.getType();
+ assert(ReturnTy->isFPOrFPVectorTy());
+
+ FastMathFlags FMF = I.getFastMathFlags();
+ // TODO Make use of those flags for optimization?
+ FMF.setAllowReciprocal(false);
+ FMF.setAllowContract(false);
+ FMF.setApproxFunc(false);
+
+ IRBuilder<> B(&I);
+ B.setFastMathFlags(FMF);
+ B.SetCurrentDebugLocation(I.getDebugLoc());
+
+ Type *ElemTy = ReturnTy->getScalarType();
+ const std::optional<FRemExpander> Expander = FRemExpander::create(B, ElemTy);
+
+ if (!Expander || isa<ScalableVectorType>(ReturnTy)) {
+ LLVM_DEBUG(dbgs() << "Cannot expand 'frem' of type " << ReturnTy << ".\n");
+ return false;
+ }
+
+ Value *Ret;
+ if (ReturnTy->isFloatingPointTy())
+ Ret = Expander->buildFRem(I.getOperand(0), I.getOperand(1));
+ else {
+ auto VecTy = cast<FixedVectorType>(ReturnTy);
+
+ // This could use SplitBlockAndInsertForEachLane but the interface
+ // is a bit awkward for a constant number of elements and it will
+ // boil down to the same code.
+ // TODO Expand the FRem instruction only once and reuse the code.
+ Value *Nums = I.getOperand(0);
+ Value *Denums = I.getOperand(1);
+ Ret = PoisonValue::get(I.getType());
+ for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
+ Value *Num = B.CreateExtractElement(Nums, I);
+ Value *Denum = B.CreateExtractElement(Denums, I);
+ Value *Rem = Expander->buildFRem(Num, Denum);
+ Ret = B.CreateInsertElement(Ret, Rem, I);
+ }
+ }
+
+ I.replaceAllUsesWith(Ret);
+ Ret->takeName(&I);
+ I.removeFromParent();
+ I.dropAllReferences();
+
+ return true;
+}
+
/// Generate code to convert a fp number to integer, replacing FPToS(U)I with
/// the generated code. This currently generates code similarly to compiler-rt's
/// implementations.
@@ -604,6 +979,12 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
for (auto &I : instructions(F)) {
switch (I.getOpcode()) {
+ case Instruction::FRem:
+ if (TLI.shouldExpandFRemInIR()) {
+ Replace.push_back(&I);
+ Modified = true;
+ }
+ break;
case Instruction::FPToUI:
case Instruction::FPToSI: {
// TODO: This pass doesn't handle scalable vectors.
@@ -654,8 +1035,10 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
while (!Replace.empty()) {
Instruction *I = Replace.pop_back_val();
- if (I->getOpcode() == Instruction::FPToUI ||
- I->getOpcode() == Instruction::FPToSI) {
+ if (I->getOpcode() == Instruction::FRem)
+ expandFRem(llvm::cast<BinaryOperator>(*I));
+ else if (I->getOpcode() == Instruction::FPToUI ||
+ I->getOpcode() == Instruction::FPToSI) {
expandFPToI(I);
} else {
expandIToFP(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index c74dc7942f52c..b2b136c984bf4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,6 +387,8 @@ class AMDGPUTargetLowering : public TargetLowering {
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}
+ bool shouldExpandFRemInIR() const override { return true; };
+
};
namespace AMDGPUISD {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index e4e6c44b051c3..e40d9690d832b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -7,59 +7,206 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: s_load_dword s3, s[4:5], 0x2
+; CI-NEXT: s_mov_b32 s4, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
-; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
-; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, |s3|
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT: s_cbranch_vccz .LBB0_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: .LBB0_2: ; %Flow18
+; CI-NEXT: s_xor_b32 s4, s4, 1
+; CI-NEXT: s_and_b32 s4, s4, 1
+; CI-NEXT: s_cmp_lg_u32 s4, 0
+; CI-NEXT: s_cbranch_scc1 .LBB0_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT: v_fma_f32 v4, v5, v4, v4
-; CI-NEXT: v_mul_f32_e32 v5, v3, v4
-; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT: v_fma_f32 v5, v6, v4, v5
-; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB0_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB0_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT: s_cbranch_vccnz .LBB0_5
+; CI-NEXT: s_branch .LBB0_7
+; CI-NEXT: .LBB0_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB0_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT: v_xor_b32_e32 v0, s4, v0
+; CI-NEXT: .LBB0_8: ; %Flow19
+; CI-NEXT: s_and_b32 s3, s3, 0x7fff
+; CI-NEXT: s_and_b32 s3, 0xffff, s3
+; CI-NEXT: s_cmp_eq_u32 s3, 0
+; CI-NEXT: s_cselect_b32 s4, 1, 0
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff
+; CI-NEXT: s_and_b32 s2, 0xffff, s2
+; CI-NEXT: s_cmpk_lt_u32 s2, 0x7c00
+; CI-NEXT: s_cselect_b32 s2, 1, 0
+; CI-NEXT: s_cmpk_le_u32 s3, 0x7c00
+; CI-NEXT: s_cselect_b32 s3, 1, 0
+; CI-NEXT: s_and_b32 s2, s3, s2
+; CI-NEXT: s_and_b32 s3, 1, s4
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3
+; CI-NEXT: v_mov_b32_e32 v1, 0x7e00
+; CI-NEXT: s_and_b32 s2, 1, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: frem_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT: s_mov_b32 s1, 1
+; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_load_dword s3, s[4:5], 0x8
+; VI-NEXT: s_load_dword s0, s[10:11], 0x0
+; VI-NEXT: s_load_dword s2, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_rcp_f32_e32 v3, v2
-; VI-NEXT: v_mul_f32_e32 v4, v0, v3
-; VI-NEXT: v_mad_f32 v5, -v2, v4, v0
-; VI-NEXT: v_mac_f32_e32 v4, v5, v3
-; VI-NEXT: v_mad_f32 v0, -v2, v4, v0
-; VI-NEXT: v_mul_f32_e32 v0, v0, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; VI-NEXT: v_add_f32_e32 v0, v0, v4
-; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
-; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, v1, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s0|
+; VI-NEXT: v_cvt_f32_f16_e64 v0, |s2|
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v0
+; VI-NEXT: s_cbranch_vccz .LBB0_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: .LBB0_2: ; %Flow18
+; VI-NEXT: s_xor_b32 s1, s1, 1
+; VI-NEXT: s_and_b32 s1, s1, 1
+; VI-NEXT: s_cmp_lg_u32 s1, 0
+; VI-NEXT: s_cbranch_scc1 .LBB0_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v2
+; VI-NEXT: v_ldexp_f32 v4, v1, 11
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB0_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB0_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT: s_cbranch_vccnz .LBB0_5
+; VI-NEXT: s_branch .LBB0_7
+; VI-NEXT: .LBB0_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB0_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT: v_xor_b32_e32 v1, s1, v0
+; VI-NEXT: .LBB0_8: ; %Flow19
+; VI-NEXT: v_mov_b32_e32 v0, 0x60
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
+; VI-NEXT: v_cmp_class_f16_e64 s[0:1], s0, v0
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: v_mov_b32_e32 v0, 0x7e00
+; VI-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -75,35 +222,176 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: s_load_dword s3, s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT: v_cvt_f32_f16_e64 v0, |s3|
+; CI-NEXT: s_mov_b32 s3, 1
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v0
+; CI-NEXT: s_cbranch_vccz .LBB1_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s3, s2, 0xffff8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v0
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: s_mov_b32 s3, 0
+; CI-NEXT: .LBB1_2: ; %Flow18
+; CI-NEXT: s_xor_b32 s3, s3, 1
+; CI-NEXT: s_and_b32 s3, s3, 1
+; CI-NEXT: s_cmp_lg_u32 s3, 0
+; CI-NEXT: s_cbranch_scc1 .LBB1_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e32 v1, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v1, 11
+; CI-NEXT: v_frexp_mant_f32_e32 v1, v0
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB1_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB1_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT: s_cbranch_vccnz .LBB1_5
+; CI-NEXT: s_branch .LBB1_7
+; CI-NEXT: .LBB1_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB1_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: s_and_b32 s2, s2, 0xffff8000
+; CI-NEXT: v_xor_b32_e32 v1, s2, v0
+; CI-NEXT: .LBB1_8: ; %Flow19
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_rcp_f32_e32 v2, v1
-; CI-NEXT: v_mul_f32_e32 v2, v0, v2
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT: buffer_store_short v1, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: fast_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: ; implicit-def: $vgpr2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s3
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
-; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s3, v1
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: v_cvt_f32_f16_e64 v1, |s2|
+; VI-NEXT: v_cvt_f32_f16_e64 v0, |s3|
+; VI-NEXT: s_mov_b32 s3, 1
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v1, v0
+; VI-NEXT: s_cbranch_vccz .LBB1_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s3, s2, 0xffff8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v1, v0
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: .LBB1_2: ; %Flow18
+; VI-NEXT: s_xor_b32 s3, s3, 1
+; VI-NEXT: s_and_b32 s3, s3, 1
+; VI-NEXT: s_cmp_lg_u32 s3, 0
+; VI-NEXT: s_cbranch_scc1 .LBB1_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e32 v2, v1
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v1
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v5
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_ldexp_f32 v4, v2, 11
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v3, v0
+; VI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB1_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB1_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT: s_cbranch_vccnz .LBB1_5
+; VI-NEXT: s_branch .LBB1_7
+; VI-NEXT: .LBB1_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB1_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: s_and_b32 s2, s2, 0xffff8000
+; VI-NEXT: v_xor_b32_e32 v2, s2, v0
+; VI-NEXT: .LBB1_8: ; %Flow19
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
@@ -121,37 +409,182 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: s_load_dword s3, s[4:5], 0x2
+; CI-NEXT: s_mov_b32 s4, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
-; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, |s3|
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT: s_cbranch_vccz .LBB2_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: .LBB2_2: ; %Flow18
+; CI-NEXT: s_xor_b32 s4, s4, 1
+; CI-NEXT: s_and_b32 s4, s4, 1
+; CI-NEXT: s_cmp_lg_u32 s4, 0
+; CI-NEXT: s_cbranch_scc1 .LBB2_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v2, v0
; CI-NEXT: v_rcp_f32_e32 v2, v1
-; CI-NEXT: v_mul_f32_e32 v2, v0, v2
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; CI-NEXT: s_cbranch_vccnz .LBB2_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v5
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
+; CI-NEXT: .LBB2_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v2
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; CI-NEXT: s_cbranch_vccnz .LBB2_5
+; CI-NEXT: s_branch .LBB2_7
+; CI-NEXT: .LBB2_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB2_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v5, v3
+; CI-NEXT: v_mul_f32_e32 v2, v3, v2
+; CI-NEXT: v_rndne_f32_e32 v2, v2
+; CI-NEXT: v_fma_f32 v2, -v2, v1, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT: v_xor_b32_e32 v0, s4, v0
+; CI-NEXT: .LBB2_8: ; %Flow19
+; CI-NEXT: s_and_b32 s3, s3, 0x7fff
+; CI-NEXT: s_and_b32 s3, 0xffff, s3
+; CI-NEXT: s_cmp_eq_u32 s3, 0
+; CI-NEXT: s_cselect_b32 s4, 1, 0
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff
+; CI-NEXT: s_and_b32 s2, 0xffff, s2
+; CI-NEXT: s_cmpk_lt_u32 s2, 0x7c00
+; CI-NEXT: s_cselect_b32 s2, 1, 0
+; CI-NEXT: s_cmpk_le_u32 s3, 0x7c00
+; CI-NEXT: s_cselect_b32 s3, 1, 0
+; CI-NEXT: s_and_b32 s2, s3, s2
+; CI-NEXT: s_and_b32 s3, 1, s4
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3
+; CI-NEXT: v_mov_b32_e32 v1, 0x7e00
+; CI-NEXT: s_and_b32 s2, 1, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: unsafe_frem_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT: s_mov_b32 s1, 1
+; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_load_dword s3, s[4:5], 0x8
+; VI-NEXT: s_load_dword s0, s[10:11], 0x0
+; VI-NEXT: s_load_dword s2, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s3
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
-; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s3, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s0|
+; VI-NEXT: v_cvt_f32_f16_e64 v0, |s2|
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v0
+; VI-NEXT: s_cbranch_vccz .LBB2_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: .LBB2_2: ; %Flow18
+; VI-NEXT: s_xor_b32 s1, s1, 1
+; VI-NEXT: s_and_b32 s1, s1, 1
+; VI-NEXT: s_cmp_lg_u32 s1, 0
+; VI-NEXT: s_cbranch_scc1 .LBB2_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT: v_ldexp_f32 v4, v1, 11
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v0
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v2, v0
+; VI-NEXT: v_rcp_f32_e32 v2, v1
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; VI-NEXT: s_cbranch_vccnz .LBB2_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v5
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
+; VI-NEXT: .LBB2_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v2
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v4, v4, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; VI-NEXT: s_cbranch_vccnz .LBB2_5
+; VI-NEXT: s_branch .LBB2_7
+; VI-NEXT: .LBB2_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB2_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
+; VI-NEXT: v_ldexp_f32 v3, v5, v3
+; VI-NEXT: v_mul_f32_e32 v2, v3, v2
+; VI-NEXT: v_rndne_f32_e32 v2, v2
+; VI-NEXT: v_fma_f32 v2, -v2, v1, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT: v_xor_b32_e32 v1, s1, v0
+; VI-NEXT: .LBB2_8: ; %Flow19
+; VI-NEXT: v_mov_b32_e32 v0, 0x60
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
+; VI-NEXT: v_cmp_class_f16_e64 s[0:1], s0, v0
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: v_mov_b32_e32 v0, 0x7e00
+; VI-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
@@ -168,27 +601,96 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s6, s[2:3], 0x0
-; CI-NEXT: s_load_dword s2, s[4:5], 0x4
+; CI-NEXT: s_load_dword s2, s[2:3], 0x0
+; CI-NEXT: s_load_dword s3, s[4:5], 0x4
+; CI-NEXT: s_mov_b32 s4, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s3
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT: ; implicit-def: $vgpr0
+; CI-NEXT: s_cbranch_vccz .LBB3_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s4, s2, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6
-; CI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
-; CI-NEXT: v_rcp_f32_e32 v3, v1
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: .LBB3_2: ; %Flow16
+; CI-NEXT: s_xor_b32 s4, s4, 1
+; CI-NEXT: s_and_b32 s4, s4, 1
+; CI-NEXT: s_cmp_lg_u32 s4, 0
+; CI-NEXT: s_cbranch_scc1 .LBB3_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; CI-NEXT: v_fma_f32 v3, v4, v3, v3
-; CI-NEXT: v_mul_f32_e32 v4, v2, v3
-; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; CI-NEXT: v_fma_f32 v4, v5, v3, v4
-; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB3_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB3_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT: s_cbranch_vccnz .LBB3_5
+; CI-NEXT: s_branch .LBB3_7
+; CI-NEXT: .LBB3_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB3_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: s_and_b32 s4, s2, 0x80000000
+; CI-NEXT: v_xor_b32_e32 v0, s4, v0
+; CI-NEXT: .LBB3_8: ; %Flow17
+; CI-NEXT: v_mov_b32_e32 v1, 0x60
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s3, v1
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s2, v2
+; CI-NEXT: v_cmp_class_f32_e64 s[2:3], s3, 3
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s6
-; CI-NEXT: v_trunc_f32_e32 v1, v1
-; CI-NEXT: v_fma_f32 v0, -v1, v0, s6
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
@@ -197,25 +699,94 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s6, s[2:3], 0x0
-; VI-NEXT: s_load_dword s2, s[4:5], 0x10
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_mov_b32 s4, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: s_cbranch_vccz .LBB3_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s4, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s6
-; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
-; VI-NEXT: v_rcp_f32_e32 v3, v1
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: .LBB3_2: ; %Flow16
+; VI-NEXT: s_xor_b32 s4, s4, 1
+; VI-NEXT: s_and_b32 s4, s4, 1
+; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cbranch_scc1 .LBB3_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 12
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT: v_fma_f32 v3, v4, v3, v3
-; VI-NEXT: v_mul_f32_e32 v4, v2, v3
-; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT: v_fma_f32 v4, v5, v3, v4
-; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s6
-; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, -v1, v0, s6
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB3_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB3_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT: s_cbranch_vccnz .LBB3_5
+; VI-NEXT: s_branch .LBB3_7
+; VI-NEXT: .LBB3_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB3_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: s_and_b32 s4, s2, 0x80000000
+; VI-NEXT: v_xor_b32_e32 v0, s4, v0
+; VI-NEXT: .LBB3_8: ; %Flow17
+; VI-NEXT: v_mov_b32_e32 v1, 0x60
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s3, v1
+; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-NEXT: v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v2
+; VI-NEXT: v_cmp_class_f32_e64 s[2:3], s3, 3
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -236,12 +807,82 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: s_load_dword s3, s[4:5], 0x4
+; CI-NEXT: s_mov_b32 s4, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: v_rcp_f32_e32 v0, s3
-; CI-NEXT: v_mul_f32_e32 v0, s2, v0
-; CI-NEXT: v_trunc_f32_e32 v0, v0
-; CI-NEXT: v_fma_f32 v0, -v0, s3, v1
+; CI-NEXT: v_mov_b32_e32 v0, s3
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT: ; implicit-def: $vgpr0
+; CI-NEXT: s_cbranch_vccz .LBB4_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s4, s2, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: .LBB4_2: ; %Flow16
+; CI-NEXT: s_xor_b32 s4, s4, 1
+; CI-NEXT: s_and_b32 s4, s4, 1
+; CI-NEXT: s_cmp_lg_u32 s4, 0
+; CI-NEXT: s_cbranch_scc1 .LBB4_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB4_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB4_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT: s_cbranch_vccnz .LBB4_5
+; CI-NEXT: s_branch .LBB4_7
+; CI-NEXT: .LBB4_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB4_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: s_and_b32 s2, s2, 0x80000000
+; CI-NEXT: v_xor_b32_e32 v0, s2, v0
+; CI-NEXT: .LBB4_8: ; %Flow17
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -254,15 +895,85 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_mov_b32 s4, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s3
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
-; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s3, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: s_cbranch_vccz .LBB4_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s4, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: .LBB4_2: ; %Flow16
+; VI-NEXT: s_xor_b32 s4, s4, 1
+; VI-NEXT: s_and_b32 s4, s4, 1
+; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cbranch_scc1 .LBB4_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 12
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB4_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB4_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT: s_cbranch_vccnz .LBB4_5
+; VI-NEXT: s_branch .LBB4_7
+; VI-NEXT: .LBB4_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB4_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: s_and_b32 s2, s2, 0x80000000
+; VI-NEXT: v_xor_b32_e32 v0, s2, v0
+; VI-NEXT: .LBB4_8: ; %Flow17
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: flat_store_dword v[1:2], v0
; VI-NEXT: s_endpgm
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -280,12 +991,80 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: s_load_dword s3, s[4:5], 0x4
+; CI-NEXT: s_mov_b32 s4, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: v_rcp_f32_e32 v0, s3
-; CI-NEXT: v_mul_f32_e32 v0, s2, v0
-; CI-NEXT: v_trunc_f32_e32 v0, v0
-; CI-NEXT: v_fma_f32 v0, -v0, s3, v1
+; CI-NEXT: v_mov_b32_e32 v0, s3
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT: ; implicit-def: $vgpr0
+; CI-NEXT: s_cbranch_vccz .LBB5_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s4, s2, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: .LBB5_2: ; %Flow16
+; CI-NEXT: s_xor_b32 s4, s4, 1
+; CI-NEXT: s_and_b32 s4, s4, 1
+; CI-NEXT: s_cmp_lg_u32 s4, 0
+; CI-NEXT: s_cbranch_scc1 .LBB5_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v2, v0
+; CI-NEXT: v_rcp_f32_e32 v2, v1
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; CI-NEXT: s_cbranch_vccnz .LBB5_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v5
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
+; CI-NEXT: .LBB5_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v2
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -12, v3
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; CI-NEXT: s_cbranch_vccnz .LBB5_5
+; CI-NEXT: s_branch .LBB5_7
+; CI-NEXT: .LBB5_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB5_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v5, v3
+; CI-NEXT: v_mul_f32_e32 v2, v3, v2
+; CI-NEXT: v_rndne_f32_e32 v2, v2
+; CI-NEXT: v_fma_f32 v2, -v2, v1, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: s_and_b32 s4, s2, 0x80000000
+; CI-NEXT: v_xor_b32_e32 v0, s4, v0
+; CI-NEXT: .LBB5_8: ; %Flow17
+; CI-NEXT: v_mov_b32_e32 v1, 0x60
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s3, v1
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v2, 0x1f8
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s2, v2
+; CI-NEXT: v_cmp_class_f32_e64 s[2:3], s3, 3
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -298,12 +1077,80 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_mov_b32 s4, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s3
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
-; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s3, v1
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: s_cbranch_vccz .LBB5_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s4, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: .LBB5_2: ; %Flow16
+; VI-NEXT: s_xor_b32 s4, s4, 1
+; VI-NEXT: s_and_b32 s4, s4, 1
+; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cbranch_scc1 .LBB5_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 12
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v2, v0
+; VI-NEXT: v_rcp_f32_e32 v2, v1
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; VI-NEXT: s_cbranch_vccnz .LBB5_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v5
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6
+; VI-NEXT: .LBB5_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v2
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -12, v3
+; VI-NEXT: v_ldexp_f32 v4, v4, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; VI-NEXT: s_cbranch_vccnz .LBB5_5
+; VI-NEXT: s_branch .LBB5_7
+; VI-NEXT: .LBB5_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB5_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v3, v5, v3
+; VI-NEXT: v_mul_f32_e32 v2, v3, v2
+; VI-NEXT: v_rndne_f32_e32 v2, v2
+; VI-NEXT: v_fma_f32 v2, -v2, v1, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: s_and_b32 s4, s2, 0x80000000
+; VI-NEXT: v_xor_b32_e32 v0, s4, v0
+; VI-NEXT: .LBB5_8: ; %Flow17
+; VI-NEXT: v_mov_b32_e32 v1, 0x60
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s3, v1
+; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; VI-NEXT: v_mov_b32_e32 v2, 0x1f8
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v2
+; VI-NEXT: v_cmp_class_f32_e64 s[2:3], s3, 3
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -321,25 +1168,109 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_mov_b32 s6, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
-; CI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
-; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
-; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
-; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
+; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB6_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_brev_b32 s7, 1
+; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT: v_mov_b32_e32 v0, s6
+; CI-NEXT: v_mov_b32_e32 v1, s7
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB6_2: ; %Flow16
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_and_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB6_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0
+; CI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; CI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB6_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT: .LBB6_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT: s_cbranch_vccnz .LBB6_5
+; CI-NEXT: s_branch .LBB6_7
+; CI-NEXT: .LBB6_6:
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: .LBB6_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_brev_b32 s7, 1
+; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT: v_xor_b32_e32 v0, s6, v0
+; CI-NEXT: v_xor_b32_e32 v1, s7, v1
+; CI-NEXT: .LBB6_8: ; %Flow17
+; CI-NEXT: v_mov_b32_e32 v2, 0x60
+; CI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2
+; CI-NEXT: v_mov_b32_e32 v2, 0x7ff80000
+; CI-NEXT: v_mov_b32_e32 v3, 0x1f8
+; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: v_cmp_class_f64_e32 vcc, s[2:3], v3
+; CI-NEXT: v_cmp_class_f64_e64 s[2:3], s[4:5], 3
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -349,26 +1280,110 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_mov_b32 s6, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
-; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
-; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
-; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB6_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: s_brev_b32 s7, 1
+; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: .LBB6_2: ; %Flow16
+; VI-NEXT: s_xor_b32 s6, s6, 1
+; VI-NEXT: s_and_b32 s6, s6, 1
+; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cbranch_scc1 .LBB6_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], 1.0
+; VI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB6_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT: .LBB6_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT: s_cbranch_vccnz .LBB6_5
+; VI-NEXT: s_branch .LBB6_7
+; VI-NEXT: .LBB6_6:
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: .LBB6_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: s_brev_b32 s7, 1
+; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT: v_xor_b32_e32 v0, s6, v0
+; VI-NEXT: v_xor_b32_e32 v1, s7, v1
+; VI-NEXT: .LBB6_8: ; %Flow17
+; VI-NEXT: v_mov_b32_e32 v2, 0x60
+; VI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2
+; VI-NEXT: v_mov_b32_e32 v2, 0x7ff80000
+; VI-NEXT: v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_cmp_class_f64_e32 vcc, s[2:3], v3
+; VI-NEXT: v_cmp_class_f64_e64 s[2:3], s[4:5], 3
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -384,24 +1399,99 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_mov_b32 s6, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB7_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_brev_b32 s7, 1
+; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT: v_mov_b32_e32 v0, s6
+; CI-NEXT: v_mov_b32_e32 v1, s7
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB7_2: ; %Flow16
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_and_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB7_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; CI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; CI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB7_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT: .LBB7_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT: s_cbranch_vccnz .LBB7_5
+; CI-NEXT: s_branch .LBB7_7
+; CI-NEXT: .LBB7_6:
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: .LBB7_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: s_brev_b32 s5, 1
+; CI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
+; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT: v_xor_b32_e32 v0, s2, v0
+; CI-NEXT: v_xor_b32_e32 v1, s3, v1
+; CI-NEXT: .LBB7_8: ; %Flow17
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
@@ -409,22 +1499,97 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_mov_b32 s6, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB7_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: s_brev_b32 s7, 1
+; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: .LBB7_2: ; %Flow16
+; VI-NEXT: s_xor_b32 s6, s6, 1
+; VI-NEXT: s_and_b32 s6, s6, 1
+; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cbranch_scc1 .LBB7_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
+; VI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB7_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT: .LBB7_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT: s_cbranch_vccnz .LBB7_5
+; VI-NEXT: s_branch .LBB7_7
+; VI-NEXT: .LBB7_6:
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: .LBB7_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_brev_b32 s5, 1
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
+; VI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT: v_xor_b32_e32 v0, s2, v0
+; VI-NEXT: v_xor_b32_e32 v1, s3, v1
+; VI-NEXT: .LBB7_8: ; %Flow17
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -441,24 +1606,107 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_mov_b32 s6, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; CI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; CI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB8_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_brev_b32 s7, 1
+; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT: v_mov_b32_e32 v0, s6
+; CI-NEXT: v_mov_b32_e32 v1, s7
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB8_2: ; %Flow16
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_and_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB8_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; CI-NEXT: v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; CI-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; CI-NEXT: v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; CI-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; CI-NEXT: v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; CI-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; CI-NEXT: s_cbranch_vccnz .LBB8_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT: .LBB8_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT: s_cbranch_vccnz .LBB8_5
+; CI-NEXT: s_branch .LBB8_7
+; CI-NEXT: .LBB8_6:
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: .LBB8_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_brev_b32 s7, 1
+; CI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT: v_xor_b32_e32 v0, s6, v0
+; CI-NEXT: v_xor_b32_e32 v1, s7, v1
+; CI-NEXT: .LBB8_8: ; %Flow17
+; CI-NEXT: v_mov_b32_e32 v2, 0x60
+; CI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2
+; CI-NEXT: v_mov_b32_e32 v2, 0x7ff80000
+; CI-NEXT: v_mov_b32_e32 v3, 0x1f8
+; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: v_cmp_class_f64_e32 vcc, s[2:3], v3
+; CI-NEXT: v_cmp_class_f64_e64 s[2:3], s[4:5], 3
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
@@ -466,23 +1714,106 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_mov_b32 s6, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
-; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB8_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[2:3]|, |v[0:1]|
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: s_brev_b32 s7, 1
+; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
-; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: .LBB8_2: ; %Flow16
+; VI-NEXT: s_xor_b32 s6, s6, 1
+; VI-NEXT: s_and_b32 s6, s6, 1
+; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cbranch_scc1 .LBB8_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[2:3]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[4:5]|
+; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT: v_rcp_f64_e32 v[2:3], v[0:1]
+; VI-NEXT: v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; VI-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; VI-NEXT: v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; VI-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; VI-NEXT: v_fma_f64 v[10:11], -v[0:1], v[2:3], 1.0
+; VI-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3]
+; VI-NEXT: s_cbranch_vccnz .LBB8_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT: .LBB8_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT: s_cbranch_vccnz .LBB8_5
+; VI-NEXT: s_branch .LBB8_7
+; VI-NEXT: .LBB8_6:
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: .LBB8_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: s_brev_b32 s7, 1
+; VI-NEXT: s_and_b64 s[6:7], s[2:3], s[6:7]
+; VI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT: v_xor_b32_e32 v0, s6, v0
+; VI-NEXT: v_xor_b32_e32 v1, s7, v1
+; VI-NEXT: .LBB8_8: ; %Flow17
+; VI-NEXT: v_mov_b32_e32 v2, 0x60
+; VI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2
+; VI-NEXT: v_mov_b32_e32 v2, 0x7ff80000
+; VI-NEXT: v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_cmp_class_f64_e32 vcc, s[2:3], v3
+; VI-NEXT: v_cmp_class_f64_e64 s[2:3], s[4:5], 3
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -499,100 +1830,393 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: s_load_dword s3, s[4:5], 0x4
+; CI-NEXT: s_mov_b32 s4, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
-; CI-NEXT: s_lshr_b32 s4, s2, 16
-; CI-NEXT: s_lshr_b32 s5, s3, 16
-; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
-; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, |s3|
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT: s_cbranch_vccz .LBB9_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT: s_mov_b32 s4, 0
+; CI-NEXT: .LBB9_2: ; %Flow60
+; CI-NEXT: s_xor_b32 s4, s4, 1
+; CI-NEXT: s_and_b32 s4, s4, 1
+; CI-NEXT: s_cmp_lg_u32 s4, 0
+; CI-NEXT: s_cbranch_scc1 .LBB9_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT: v_fma_f32 v4, v5, v4, v4
-; CI-NEXT: v_mul_f32_e32 v5, v3, v4
-; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT: v_fma_f32 v5, v6, v4, v5
-; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; CI-NEXT: v_cvt_f32_f16_e32 v2, s5
-; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB9_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB9_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT: s_cbranch_vccnz .LBB9_5
+; CI-NEXT: s_branch .LBB9_7
+; CI-NEXT: .LBB9_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB9_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, v1
-; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1
-; CI-NEXT: v_rcp_f32_e32 v5, v3
+; CI-NEXT: s_and_b32 s4, s2, 0xffff8000
+; CI-NEXT: v_xor_b32_e32 v0, s4, v0
+; CI-NEXT: .LBB9_8: ; %Flow61
+; CI-NEXT: s_lshr_b32 s4, s2, 16
+; CI-NEXT: s_lshr_b32 s5, s3, 16
+; CI-NEXT: v_cvt_f32_f16_e64 v3, |s4|
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s5|
+; CI-NEXT: s_mov_b32 s6, 1
+; CI-NEXT: ; implicit-def: $vgpr1
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
+; CI-NEXT: s_cbranch_vccz .LBB9_10
+; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: s_and_b32 s6, s4, 0xffff8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
+; CI-NEXT: v_mov_b32_e32 v1, s6
+; CI-NEXT: v_mov_b32_e32 v4, s4
+; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB9_10: ; %Flow56
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_and_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB9_16
+; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
+; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
+; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v1, v3
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v5, v1, 11
+; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT: v_rcp_f32_e32 v9, v4
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
-; CI-NEXT: v_fma_f32 v5, v6, v5, v5
-; CI-NEXT: v_mul_f32_e32 v6, v4, v5
-; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
-; CI-NEXT: v_fma_f32 v6, v7, v5, v6
-; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
+; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT: v_fma_f32 v9, v10, v9, v9
+; CI-NEXT: v_mul_f32_e32 v10, v8, v9
+; CI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT: v_fma_f32 v10, v11, v9, v10
+; CI-NEXT: v_fma_f32 v4, -v4, v10, v8
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
-; CI-NEXT: v_trunc_f32_e32 v3, v3
-; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
+; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB9_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT: .LBB9_13: ; %frem.loop_body27
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: v_mul_f32_e32 v5, v6, v4
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v7, v5, v2
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; CI-NEXT: s_cbranch_vccnz .LBB9_13
+; CI-NEXT: s_branch .LBB9_15
+; CI-NEXT: .LBB9_14:
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT: v_mul_f32_e32 v4, v3, v4
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT: v_add_f32_e32 v2, v3, v2
+; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: s_and_b32 s6, s4, 0xffff8000
+; CI-NEXT: v_xor_b32_e32 v1, s6, v1
+; CI-NEXT: .LBB9_16: ; %Flow57
+; CI-NEXT: s_and_b32 s3, s3, 0x7fff
+; CI-NEXT: s_and_b32 s3, 0xffff, s3
+; CI-NEXT: s_cmp_eq_u32 s3, 0
+; CI-NEXT: s_cselect_b32 s6, 1, 0
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff
+; CI-NEXT: s_and_b32 s2, 0xffff, s2
+; CI-NEXT: s_cmpk_lt_u32 s2, 0x7c00
+; CI-NEXT: s_cselect_b32 s2, 1, 0
+; CI-NEXT: s_cmpk_le_u32 s3, 0x7c00
+; CI-NEXT: s_cselect_b32 s3, 1, 0
+; CI-NEXT: s_and_b32 s2, s3, s2
+; CI-NEXT: s_and_b32 s3, s5, 0x7fff
+; CI-NEXT: s_and_b32 s3, 0xffff, s3
+; CI-NEXT: s_cmp_eq_u32 s3, 0
+; CI-NEXT: s_cselect_b32 s5, 1, 0
+; CI-NEXT: s_and_b32 s4, s4, 0x7fff
+; CI-NEXT: s_and_b32 s4, 0xffff, s4
+; CI-NEXT: s_cmpk_lt_u32 s4, 0x7c00
+; CI-NEXT: s_cselect_b32 s4, 1, 0
+; CI-NEXT: s_cmpk_le_u32 s3, 0x7c00
+; CI-NEXT: s_cselect_b32 s3, 1, 0
+; CI-NEXT: s_and_b32 s3, s3, s4
+; CI-NEXT: s_and_b32 s4, 1, s6
+; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; CI-NEXT: v_mov_b32_e32 v2, 0x7e00
+; CI-NEXT: s_and_b32 s2, 1, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: s_and_b32 s2, 1, s5
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: s_and_b32 s2, 1, s3
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
+; CI-NEXT: s_mov_b32 s2, -1
+; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: frem_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34
+; VI-NEXT: s_mov_b32 s1, 1
+; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_load_dword s0, s[10:11], 0x0
+; VI-NEXT: s_load_dword s2, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: v_rcp_f32_e32 v3, v2
-; VI-NEXT: v_mul_f32_e32 v4, v0, v3
-; VI-NEXT: v_mad_f32 v5, -v2, v4, v0
-; VI-NEXT: v_mac_f32_e32 v4, v5, v3
-; VI-NEXT: v_mad_f32 v0, -v2, v4, v0
-; VI-NEXT: v_mul_f32_e32 v0, v0, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; VI-NEXT: v_add_f32_e32 v0, v0, v4
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s0|
+; VI-NEXT: v_cvt_f32_f16_e64 v1, |s2|
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; VI-NEXT: s_cbranch_vccz .LBB9_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: .LBB9_2: ; %Flow60
+; VI-NEXT: s_xor_b32 s1, s1, 1
+; VI-NEXT: s_and_b32 s1, s1, 1
+; VI-NEXT: s_cmp_lg_u32 s1, 0
+; VI-NEXT: s_cbranch_scc1 .LBB9_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; VI-NEXT: v_ldexp_f32 v1, v3, 1
+; VI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 11
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB9_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB9_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT: s_cbranch_vccnz .LBB9_5
+; VI-NEXT: s_branch .LBB9_7
+; VI-NEXT: .LBB9_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB9_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s5
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
-; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; VI-NEXT: v_rcp_f32_e32 v4, v3
-; VI-NEXT: v_mul_f32_e32 v5, v1, v4
-; VI-NEXT: v_mad_f32 v6, -v3, v5, v1
-; VI-NEXT: v_mac_f32_e32 v5, v6, v4
-; VI-NEXT: v_mad_f32 v1, -v3, v5, v1
-; VI-NEXT: v_mul_f32_e32 v1, v1, v4
-; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; VI-NEXT: v_add_f32_e32 v1, v1, v5
+; VI-NEXT: s_and_b32 s1, s0, 0xffff8000
+; VI-NEXT: v_xor_b32_e32 v0, s1, v0
+; VI-NEXT: .LBB9_8: ; %Flow61
+; VI-NEXT: s_lshr_b32 s4, s0, 16
+; VI-NEXT: s_lshr_b32 s6, s2, 16
+; VI-NEXT: v_cvt_f32_f16_e64 v3, |s4|
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s6|
+; VI-NEXT: s_mov_b32 s1, 1
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
+; VI-NEXT: s_cbranch_vccz .LBB9_10
+; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: s_and_b32 s1, s4, 0xffff8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: .LBB9_10: ; %Flow56
+; VI-NEXT: s_xor_b32 s1, s1, 1
+; VI-NEXT: s_and_b32 s1, s1, 1
+; VI-NEXT: s_cmp_lg_u32 s1, 0
+; VI-NEXT: s_cbranch_scc1 .LBB9_16
+; VI-NEXT: ; %bb.11: ; %frem.compute19
+; VI-NEXT: v_frexp_mant_f32_e32 v4, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
+; VI-NEXT: v_ldexp_f32 v2, v4, 1
+; VI-NEXT: v_div_scale_f32 v4, s[10:11], v2, v2, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v3
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v5, v1, 11
+; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT: v_rcp_f32_e32 v9, v4
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT: v_fma_f32 v9, v10, v9, v9
+; VI-NEXT: v_mul_f32_e32 v10, v8, v9
+; VI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v4, -v4, v10, v8
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB9_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT: .LBB9_13: ; %frem.loop_body27
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v5, v6, v4
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v7, v5, v2
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v5, v5, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; VI-NEXT: s_cbranch_vccnz .LBB9_13
+; VI-NEXT: s_branch .LBB9_15
+; VI-NEXT: .LBB9_14:
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
+; VI-NEXT: v_ldexp_f32 v3, v6, v3
+; VI-NEXT: v_mul_f32_e32 v4, v3, v4
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: v_ldexp_f32 v1, v2, v1
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s4
-; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v1, -v1, v2, s4
+; VI-NEXT: s_and_b32 s1, s4, 0xffff8000
+; VI-NEXT: v_xor_b32_e32 v1, s1, v1
+; VI-NEXT: .LBB9_16: ; %Flow57
+; VI-NEXT: v_mov_b32_e32 v2, 0x60
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0x1f8
+; VI-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
+; VI-NEXT: v_cmp_class_f16_e64 s[0:1], s0, v3
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT: v_cmp_class_f16_e64 s[2:3], s6, v2
+; VI-NEXT: v_cmp_class_f16_e64 s[6:7], s6, 3
+; VI-NEXT: v_cmp_class_f16_e64 s[4:5], s4, v3
+; VI-NEXT: s_xor_b64 s[6:7], s[6:7], -1
+; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
+; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; VI-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
@@ -608,174 +2232,757 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_mov_b32 s6, 1
+; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
-; CI-NEXT: s_lshr_b32 s8, s2, 16
-; CI-NEXT: s_lshr_b32 s9, s3, 16
-; CI-NEXT: s_lshr_b32 s10, s4, 16
-; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, v0
-; CI-NEXT: s_lshr_b32 s11, s5, 16
-; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
-; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s2|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, |s4|
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; CI-NEXT: s_cbranch_vccz .LBB10_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s6, s2, 0xffff8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; CI-NEXT: v_mov_b32_e32 v0, s6
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB10_2: ; %Flow144
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_and_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB10_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
+; CI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 11
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT: v_fma_f32 v4, v5, v4, v4
-; CI-NEXT: v_mul_f32_e32 v5, v3, v4
-; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT: v_fma_f32 v5, v6, v4, v5
-; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, s8
-; CI-NEXT: v_cvt_f32_f16_e32 v2, s10
-; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB10_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB10_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; CI-NEXT: s_cbranch_vccnz .LBB10_5
+; CI-NEXT: s_branch .LBB10_7
+; CI-NEXT: .LBB10_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB10_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_div_scale_f32 v3, s[6:7], v2, v2, v1
-; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1
-; CI-NEXT: v_rcp_f32_e32 v5, v3
+; CI-NEXT: s_and_b32 s6, s2, 0xffff8000
+; CI-NEXT: v_xor_b32_e32 v0, s6, v0
+; CI-NEXT: .LBB10_8: ; %Flow145
+; CI-NEXT: s_lshr_b32 s6, s2, 16
+; CI-NEXT: s_lshr_b32 s7, s4, 16
+; CI-NEXT: v_cvt_f32_f16_e64 v3, |s6|
+; CI-NEXT: v_cvt_f32_f16_e64 v2, |s7|
+; CI-NEXT: s_mov_b32 s8, 1
+; CI-NEXT: ; implicit-def: $vgpr1
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
+; CI-NEXT: s_cbranch_vccz .LBB10_10
+; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: s_and_b32 s8, s6, 0xffff8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
+; CI-NEXT: v_mov_b32_e32 v1, s8
+; CI-NEXT: v_mov_b32_e32 v4, s6
+; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT: s_mov_b32 s8, 0
+; CI-NEXT: .LBB10_10: ; %Flow140
+; CI-NEXT: s_xor_b32 s8, s8, 1
+; CI-NEXT: s_and_b32 s8, s8, 1
+; CI-NEXT: s_cmp_lg_u32 s8, 0
+; CI-NEXT: s_cbranch_scc1 .LBB10_16
+; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
+; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
+; CI-NEXT: v_div_scale_f32 v4, s[8:9], v2, v2, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v1, v3
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v5, v1, 11
+; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT: v_rcp_f32_e32 v9, v4
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
-; CI-NEXT: v_fma_f32 v5, v6, v5, v5
-; CI-NEXT: v_mul_f32_e32 v6, v4, v5
-; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
-; CI-NEXT: v_fma_f32 v6, v7, v5, v6
-; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
+; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT: v_fma_f32 v9, v10, v9, v9
+; CI-NEXT: v_mul_f32_e32 v10, v8, v9
+; CI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT: v_fma_f32 v10, v11, v9, v10
+; CI-NEXT: v_fma_f32 v4, -v4, v10, v8
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
-; CI-NEXT: v_trunc_f32_e32 v3, v3
-; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
-; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v3, s5
+; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB10_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT: .LBB10_13: ; %frem.loop_body27
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: v_mul_f32_e32 v5, v6, v4
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v7, v5, v2
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e64 v5, v5, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; CI-NEXT: s_cbranch_vccnz .LBB10_13
+; CI-NEXT: s_branch .LBB10_15
+; CI-NEXT: .LBB10_14:
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT: v_mul_f32_e32 v4, v3, v4
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT: v_add_f32_e32 v2, v3, v2
+; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, v2
-; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v3, v2
-; CI-NEXT: v_rcp_f32_e32 v6, v4
+; CI-NEXT: s_and_b32 s8, s6, 0xffff8000
+; CI-NEXT: v_xor_b32_e32 v1, s8, v1
+; CI-NEXT: .LBB10_16: ; %Flow141
+; CI-NEXT: v_cvt_f32_f16_e64 v4, |s3|
+; CI-NEXT: v_cvt_f32_f16_e64 v3, |s5|
+; CI-NEXT: s_mov_b32 s8, 1
+; CI-NEXT: ; implicit-def: $vgpr2
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
+; CI-NEXT: s_cbranch_vccz .LBB10_18
+; CI-NEXT: ; %bb.17: ; %frem.else56
+; CI-NEXT: s_and_b32 s8, s3, 0xffff8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
+; CI-NEXT: v_mov_b32_e32 v2, s8
+; CI-NEXT: v_mov_b32_e32 v5, s3
+; CI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; CI-NEXT: s_mov_b32 s8, 0
+; CI-NEXT: .LBB10_18: ; %Flow136
+; CI-NEXT: s_xor_b32 s8, s8, 1
+; CI-NEXT: s_and_b32 s8, s8, 1
+; CI-NEXT: s_cmp_lg_u32 s8, 0
+; CI-NEXT: s_cbranch_scc1 .LBB10_24
+; CI-NEXT: ; %bb.19: ; %frem.compute55
+; CI-NEXT: v_frexp_mant_f32_e32 v5, v3
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3
+; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1
+; CI-NEXT: v_div_scale_f32 v5, s[8:9], v3, v3, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v2, v4
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4
+; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v7
+; CI-NEXT: v_ldexp_f32_e64 v6, v2, 11
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
+; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; CI-NEXT: v_rcp_f32_e32 v10, v5
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
-; CI-NEXT: v_fma_f32 v6, v7, v6, v6
-; CI-NEXT: v_mul_f32_e32 v7, v5, v6
-; CI-NEXT: v_fma_f32 v8, -v4, v7, v5
-; CI-NEXT: v_fma_f32 v7, v8, v6, v7
-; CI-NEXT: v_fma_f32 v4, -v4, v7, v5
+; CI-NEXT: v_fma_f32 v11, -v5, v10, 1.0
+; CI-NEXT: v_fma_f32 v10, v11, v10, v10
+; CI-NEXT: v_mul_f32_e32 v11, v9, v10
+; CI-NEXT: v_fma_f32 v12, -v5, v11, v9
+; CI-NEXT: v_fma_f32 v11, v12, v10, v11
+; CI-NEXT: v_fma_f32 v5, -v5, v11, v9
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
-; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v2
-; CI-NEXT: v_trunc_f32_e32 v4, v4
-; CI-NEXT: v_fma_f32 v2, -v4, v3, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v3, s9
-; CI-NEXT: v_cvt_f32_f16_e32 v4, s11
+; CI-NEXT: v_div_fmas_f32 v5, v5, v10, v11
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4
+; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB10_22
+; CI-NEXT: ; %bb.20: ; %frem.loop_body63.preheader
+; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v7
+; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
+; CI-NEXT: .LBB10_21: ; %frem.loop_body63
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v6
+; CI-NEXT: v_mul_f32_e32 v6, v7, v5
+; CI-NEXT: v_rndne_f32_e32 v6, v6
+; CI-NEXT: v_fma_f32 v6, -v6, v3, v7
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT: v_add_f32_e32 v8, v6, v3
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4
+; CI-NEXT: v_ldexp_f32_e64 v6, v6, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4
+; CI-NEXT: s_cbranch_vccnz .LBB10_21
+; CI-NEXT: s_branch .LBB10_23
+; CI-NEXT: .LBB10_22:
+; CI-NEXT: v_mov_b32_e32 v7, v6
+; CI-NEXT: .LBB10_23: ; %frem.loop_exit64
+; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4
+; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4
+; CI-NEXT: v_mul_f32_e32 v5, v4, v5
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v4, -v5, v3, v4
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v3, v4, v3
+; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_div_scale_f32 v5, s[2:3], v4, v4, v3
-; CI-NEXT: v_div_scale_f32 v6, vcc, v3, v4, v3
-; CI-NEXT: v_rcp_f32_e32 v7, v5
+; CI-NEXT: s_and_b32 s8, s3, 0xffff8000
+; CI-NEXT: v_xor_b32_e32 v2, s8, v2
+; CI-NEXT: .LBB10_24: ; %Flow137
+; CI-NEXT: s_lshr_b32 s8, s3, 16
+; CI-NEXT: s_lshr_b32 s9, s5, 16
+; CI-NEXT: v_cvt_f32_f16_e64 v5, |s8|
+; CI-NEXT: v_cvt_f32_f16_e64 v4, |s9|
+; CI-NEXT: s_mov_b32 s10, 1
+; CI-NEXT: ; implicit-def: $vgpr3
+; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4
+; CI-NEXT: s_cbranch_vccz .LBB10_26
+; CI-NEXT: ; %bb.25: ; %frem.else92
+; CI-NEXT: s_and_b32 s10, s8, 0xffff8000
+; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4
+; CI-NEXT: v_mov_b32_e32 v3, s10
+; CI-NEXT: v_mov_b32_e32 v6, s8
+; CI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; CI-NEXT: s_mov_b32 s10, 0
+; CI-NEXT: .LBB10_26: ; %Flow132
+; CI-NEXT: s_xor_b32 s10, s10, 1
+; CI-NEXT: s_and_b32 s10, s10, 1
+; CI-NEXT: s_cmp_lg_u32 s10, 0
+; CI-NEXT: s_cbranch_scc1 .LBB10_32
+; CI-NEXT: ; %bb.27: ; %frem.compute91
+; CI-NEXT: v_frexp_mant_f32_e32 v6, v4
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4
+; CI-NEXT: v_ldexp_f32_e64 v4, v6, 1
+; CI-NEXT: v_div_scale_f32 v6, s[10:11], v4, v4, 1.0
+; CI-NEXT: v_frexp_mant_f32_e32 v3, v5
+; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v5
+; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v8
+; CI-NEXT: v_ldexp_f32_e64 v7, v3, 11
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v9
+; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3
+; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; CI-NEXT: v_rcp_f32_e32 v11, v6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
-; CI-NEXT: v_fma_f32 v7, v8, v7, v7
-; CI-NEXT: v_mul_f32_e32 v8, v6, v7
-; CI-NEXT: v_fma_f32 v9, -v5, v8, v6
-; CI-NEXT: v_fma_f32 v8, v9, v7, v8
-; CI-NEXT: v_fma_f32 v5, -v5, v8, v6
+; CI-NEXT: v_fma_f32 v12, -v6, v11, 1.0
+; CI-NEXT: v_fma_f32 v11, v12, v11, v11
+; CI-NEXT: v_mul_f32_e32 v12, v10, v11
+; CI-NEXT: v_fma_f32 v13, -v6, v12, v10
+; CI-NEXT: v_fma_f32 v12, v13, v11, v12
+; CI-NEXT: v_fma_f32 v6, -v6, v12, v10
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
+; CI-NEXT: v_div_fmas_f32 v6, v6, v11, v12
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5
+; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB10_30
+; CI-NEXT: ; %bb.28: ; %frem.loop_body99.preheader
+; CI-NEXT: v_add_i32_e32 v5, vcc, 11, v8
+; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
+; CI-NEXT: .LBB10_29: ; %frem.loop_body99
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v8, v7
+; CI-NEXT: v_mul_f32_e32 v7, v8, v6
+; CI-NEXT: v_rndne_f32_e32 v7, v7
+; CI-NEXT: v_fma_f32 v7, -v7, v4, v8
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT: v_add_f32_e32 v9, v7, v4
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5
+; CI-NEXT: v_ldexp_f32_e64 v7, v7, 11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v5
+; CI-NEXT: s_cbranch_vccnz .LBB10_29
+; CI-NEXT: s_branch .LBB10_31
+; CI-NEXT: .LBB10_30:
+; CI-NEXT: v_mov_b32_e32 v8, v7
+; CI-NEXT: .LBB10_31: ; %frem.loop_exit100
+; CI-NEXT: v_add_i32_e32 v5, vcc, -10, v5
+; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5
+; CI-NEXT: v_mul_f32_e32 v6, v5, v6
+; CI-NEXT: v_rndne_f32_e32 v6, v6
+; CI-NEXT: v_fma_f32 v5, -v6, v4, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v4, v5, v4
+; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; CI-NEXT: v_ldexp_f32_e32 v3, v4, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: s_and_b32 s10, s8, 0xffff8000
+; CI-NEXT: v_xor_b32_e32 v3, s10, v3
+; CI-NEXT: .LBB10_32: ; %Flow133
+; CI-NEXT: s_and_b32 s4, s4, 0x7fff
+; CI-NEXT: s_and_b32 s4, 0xffff, s4
+; CI-NEXT: s_cmp_eq_u32 s4, 0
+; CI-NEXT: s_cselect_b32 s10, 1, 0
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff
+; CI-NEXT: s_and_b32 s2, 0xffff, s2
+; CI-NEXT: s_cmpk_lt_u32 s2, 0x7c00
+; CI-NEXT: s_cselect_b32 s2, 1, 0
+; CI-NEXT: s_cmpk_le_u32 s4, 0x7c00
+; CI-NEXT: s_cselect_b32 s4, 1, 0
+; CI-NEXT: s_and_b32 s2, s4, s2
+; CI-NEXT: s_and_b32 s4, s7, 0x7fff
+; CI-NEXT: s_and_b32 s4, 0xffff, s4
+; CI-NEXT: s_cmp_eq_u32 s4, 0
+; CI-NEXT: s_cselect_b32 s7, 1, 0
+; CI-NEXT: s_and_b32 s6, s6, 0x7fff
+; CI-NEXT: s_and_b32 s6, 0xffff, s6
+; CI-NEXT: s_cmpk_lt_u32 s6, 0x7c00
+; CI-NEXT: s_cselect_b32 s6, 1, 0
+; CI-NEXT: s_cmpk_le_u32 s4, 0x7c00
+; CI-NEXT: s_cselect_b32 s4, 1, 0
+; CI-NEXT: s_and_b32 s5, s5, 0x7fff
+; CI-NEXT: s_and_b32 s4, s4, s6
+; CI-NEXT: s_and_b32 s5, 0xffff, s5
+; CI-NEXT: s_cmp_eq_u32 s5, 0
+; CI-NEXT: s_cselect_b32 s6, 1, 0
+; CI-NEXT: s_and_b32 s3, s3, 0x7fff
+; CI-NEXT: s_and_b32 s3, 0xffff, s3
+; CI-NEXT: s_cmpk_lt_u32 s3, 0x7c00
+; CI-NEXT: s_cselect_b32 s3, 1, 0
+; CI-NEXT: s_cmpk_le_u32 s5, 0x7c00
+; CI-NEXT: s_cselect_b32 s5, 1, 0
+; CI-NEXT: s_and_b32 s3, s5, s3
+; CI-NEXT: s_and_b32 s5, s9, 0x7fff
+; CI-NEXT: s_and_b32 s5, 0xffff, s5
+; CI-NEXT: s_cmp_eq_u32 s5, 0
+; CI-NEXT: s_cselect_b32 s9, 1, 0
+; CI-NEXT: s_and_b32 s8, s8, 0x7fff
+; CI-NEXT: s_and_b32 s8, 0xffff, s8
+; CI-NEXT: s_cmpk_lt_u32 s8, 0x7c00
+; CI-NEXT: s_cselect_b32 s8, 1, 0
+; CI-NEXT: s_cmpk_le_u32 s5, 0x7c00
+; CI-NEXT: s_cselect_b32 s5, 1, 0
+; CI-NEXT: s_and_b32 s5, s5, s8
+; CI-NEXT: s_and_b32 s8, 1, s10
+; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
+; CI-NEXT: v_mov_b32_e32 v4, 0x7e00
+; CI-NEXT: s_and_b32 s2, 1, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: s_and_b32 s2, 1, s7
+; CI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: s_and_b32 s2, 1, s4
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: s_and_b32 s2, 1, s6
; CI-NEXT: v_or_b32_e32 v0, v0, v1
+; CI-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: s_and_b32 s2, 1, s3
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: s_and_b32 s2, 1, s9
+; CI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: s_and_b32 s2, 1, s5
+; CI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; CI-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
+; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3
-; CI-NEXT: v_trunc_f32_e32 v5, v5
-; CI-NEXT: v_fma_f32 v3, -v5, v4, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; CI-NEXT: v_or_b32_e32 v1, v2, v1
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: frem_v4f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
+; VI-NEXT: s_load_dwordx2 s[8:9], s[18:19], 0x0
+; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x20
+; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s4
-; VI-NEXT: s_lshr_b32 s8, s4, 16
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: s_lshr_b32 s6, s2, 16
-; VI-NEXT: v_rcp_f32_e32 v3, v2
-; VI-NEXT: s_lshr_b32 s9, s5, 16
-; VI-NEXT: s_lshr_b32 s7, s3, 16
-; VI-NEXT: v_mul_f32_e32 v4, v0, v3
-; VI-NEXT: v_mad_f32 v5, -v2, v4, v0
-; VI-NEXT: v_mac_f32_e32 v4, v5, v3
-; VI-NEXT: v_mad_f32 v0, -v2, v4, v0
-; VI-NEXT: v_mul_f32_e32 v0, v0, v3
-; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; VI-NEXT: v_add_f32_e32 v0, v0, v4
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s8|
+; VI-NEXT: v_cvt_f32_f16_e64 v1, |s10|
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
+; VI-NEXT: s_cbranch_vccz .LBB10_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s0, s8, 0xffff8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s8
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: .LBB10_2: ; %Flow144
+; VI-NEXT: s_xor_b32 s0, s0, 1
+; VI-NEXT: s_and_b32 s0, s0, 1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB10_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e32 v3, v1
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
+; VI-NEXT: v_ldexp_f32 v1, v3, 1
+; VI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v0, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v5, v2
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 11
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB10_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB10_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v2
+; VI-NEXT: s_cbranch_vccnz .LBB10_5
+; VI-NEXT: s_branch .LBB10_7
+; VI-NEXT: .LBB10_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB10_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
-; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s6
-; VI-NEXT: v_rcp_f32_e32 v4, v3
-; VI-NEXT: v_mul_f32_e32 v5, v1, v4
-; VI-NEXT: v_mad_f32 v6, -v3, v5, v1
-; VI-NEXT: v_mac_f32_e32 v5, v6, v4
-; VI-NEXT: v_mad_f32 v1, -v3, v5, v1
-; VI-NEXT: v_mul_f32_e32 v1, v1, v4
-; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; VI-NEXT: v_add_f32_e32 v1, v1, v5
+; VI-NEXT: s_and_b32 s0, s8, 0xffff8000
+; VI-NEXT: v_xor_b32_e32 v0, s0, v0
+; VI-NEXT: .LBB10_8: ; %Flow145
+; VI-NEXT: s_lshr_b32 s4, s8, 16
+; VI-NEXT: s_lshr_b32 s6, s10, 16
+; VI-NEXT: v_cvt_f32_f16_e64 v3, |s4|
+; VI-NEXT: v_cvt_f32_f16_e64 v2, |s6|
+; VI-NEXT: s_mov_b32 s0, 1
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
+; VI-NEXT: s_cbranch_vccz .LBB10_10
+; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: s_and_b32 s0, s4, 0xffff8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: .LBB10_10: ; %Flow140
+; VI-NEXT: s_xor_b32 s0, s0, 1
+; VI-NEXT: s_and_b32 s0, s0, 1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB10_16
+; VI-NEXT: ; %bb.11: ; %frem.compute19
+; VI-NEXT: v_frexp_mant_f32_e32 v4, v2
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
+; VI-NEXT: v_ldexp_f32 v2, v4, 1
+; VI-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v3
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v3
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v5, v1, 11
+; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT: v_rcp_f32_e32 v9, v4
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT: v_fma_f32 v9, v10, v9, v9
+; VI-NEXT: v_mul_f32_e32 v10, v8, v9
+; VI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v4, -v4, v10, v8
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
+; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB10_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT: .LBB10_13: ; %frem.loop_body27
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v5, v6, v4
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v7, v5, v2
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v5, v5, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v3
+; VI-NEXT: s_cbranch_vccnz .LBB10_13
+; VI-NEXT: s_branch .LBB10_15
+; VI-NEXT: .LBB10_14:
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
+; VI-NEXT: v_ldexp_f32 v3, v6, v3
+; VI-NEXT: v_mul_f32_e32 v4, v3, v4
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: v_ldexp_f32 v1, v2, v1
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_cvt_f32_f16_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6
-; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT: v_rcp_f32_e32 v5, v4
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_mul_f32_e32 v6, v2, v5
-; VI-NEXT: v_mad_f32 v7, -v4, v6, v2
-; VI-NEXT: v_mac_f32_e32 v6, v7, v5
-; VI-NEXT: v_mad_f32 v2, -v4, v6, v2
-; VI-NEXT: v_mul_f32_e32 v2, v2, v5
-; VI-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; VI-NEXT: v_add_f32_e32 v2, v2, v6
+; VI-NEXT: s_and_b32 s0, s4, 0xffff8000
+; VI-NEXT: v_xor_b32_e32 v1, s0, v1
+; VI-NEXT: .LBB10_16: ; %Flow141
+; VI-NEXT: v_cvt_f32_f16_e64 v4, |s9|
+; VI-NEXT: v_cvt_f32_f16_e64 v3, |s11|
+; VI-NEXT: s_mov_b32 s0, 1
+; VI-NEXT: ; implicit-def: $vgpr2
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
+; VI-NEXT: s_cbranch_vccz .LBB10_18
+; VI-NEXT: ; %bb.17: ; %frem.else56
+; VI-NEXT: s_and_b32 s0, s9, 0xffff8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: .LBB10_18: ; %Flow136
+; VI-NEXT: s_xor_b32 s0, s0, 1
+; VI-NEXT: s_and_b32 s0, s0, 1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB10_24
+; VI-NEXT: ; %bb.19: ; %frem.compute55
+; VI-NEXT: v_frexp_mant_f32_e32 v5, v3
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3
+; VI-NEXT: v_ldexp_f32 v3, v5, 1
+; VI-NEXT: v_div_scale_f32 v5, s[0:1], v3, v3, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v2, v4
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v7
+; VI-NEXT: v_ldexp_f32 v6, v2, 11
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; VI-NEXT: v_rcp_f32_e32 v10, v5
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v11, -v5, v10, 1.0
+; VI-NEXT: v_fma_f32 v10, v11, v10, v10
+; VI-NEXT: v_mul_f32_e32 v11, v9, v10
+; VI-NEXT: v_fma_f32 v12, -v5, v11, v9
+; VI-NEXT: v_fma_f32 v11, v12, v10, v11
+; VI-NEXT: v_fma_f32 v5, -v5, v11, v9
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v5, v5, v10, v11
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4
+; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB10_22
+; VI-NEXT: ; %bb.20: ; %frem.loop_body63.preheader
+; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v7
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
+; VI-NEXT: .LBB10_21: ; %frem.loop_body63
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_mul_f32_e32 v6, v7, v5
+; VI-NEXT: v_rndne_f32_e32 v6, v6
+; VI-NEXT: v_fma_f32 v6, -v6, v3, v7
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT: v_add_f32_e32 v8, v6, v3
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4
+; VI-NEXT: v_ldexp_f32 v6, v6, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v4
+; VI-NEXT: s_cbranch_vccnz .LBB10_21
+; VI-NEXT: s_branch .LBB10_23
+; VI-NEXT: .LBB10_22:
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: .LBB10_23: ; %frem.loop_exit64
+; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4
+; VI-NEXT: v_ldexp_f32 v4, v7, v4
+; VI-NEXT: v_mul_f32_e32 v5, v4, v5
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v4, -v5, v3, v4
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT: v_ldexp_f32 v2, v3, v2
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT: v_mov_b32_e32 v4, s9
-; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3
-; VI-NEXT: v_trunc_f16_e32 v2, v2
-; VI-NEXT: v_fma_f16 v2, -v2, v3, s3
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s7
-; VI-NEXT: v_rcp_f32_e32 v6, v5
-; VI-NEXT: v_mul_f32_e32 v7, v3, v6
-; VI-NEXT: v_mad_f32 v8, -v5, v7, v3
-; VI-NEXT: v_mac_f32_e32 v7, v8, v6
-; VI-NEXT: v_mad_f32 v3, -v5, v7, v3
-; VI-NEXT: v_mul_f32_e32 v3, v3, v6
-; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; VI-NEXT: v_add_f32_e32 v3, v3, v7
+; VI-NEXT: s_and_b32 s0, s9, 0xffff8000
+; VI-NEXT: v_xor_b32_e32 v2, s0, v2
+; VI-NEXT: .LBB10_24: ; %Flow137
+; VI-NEXT: s_lshr_b32 s12, s9, 16
+; VI-NEXT: s_lshr_b32 s14, s11, 16
+; VI-NEXT: v_cvt_f32_f16_e64 v5, |s12|
+; VI-NEXT: v_cvt_f32_f16_e64 v4, |s14|
+; VI-NEXT: s_mov_b32 s0, 1
+; VI-NEXT: ; implicit-def: $vgpr3
+; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4
+; VI-NEXT: s_cbranch_vccz .LBB10_26
+; VI-NEXT: ; %bb.25: ; %frem.else92
+; VI-NEXT: s_and_b32 s0, s12, 0xffff8000
+; VI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: .LBB10_26: ; %Flow132
+; VI-NEXT: s_xor_b32 s0, s0, 1
+; VI-NEXT: s_and_b32 s0, s0, 1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc1 .LBB10_32
+; VI-NEXT: ; %bb.27: ; %frem.compute91
+; VI-NEXT: v_frexp_mant_f32_e32 v6, v4
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4
+; VI-NEXT: v_ldexp_f32 v4, v6, 1
+; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, 1.0
+; VI-NEXT: v_frexp_mant_f32_e32 v3, v5
+; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v5
+; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v8
+; VI-NEXT: v_ldexp_f32 v7, v3, 11
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v9
+; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; VI-NEXT: v_rcp_f32_e32 v11, v6
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; VI-NEXT: v_fma_f32 v12, -v6, v11, 1.0
+; VI-NEXT: v_fma_f32 v11, v12, v11, v11
+; VI-NEXT: v_mul_f32_e32 v12, v10, v11
+; VI-NEXT: v_fma_f32 v13, -v6, v12, v10
+; VI-NEXT: v_fma_f32 v12, v13, v11, v12
+; VI-NEXT: v_fma_f32 v6, -v6, v12, v10
+; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; VI-NEXT: v_div_fmas_f32 v6, v6, v11, v12
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5
+; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB10_30
+; VI-NEXT: ; %bb.28: ; %frem.loop_body99.preheader
+; VI-NEXT: v_add_u32_e32 v5, vcc, 11, v8
+; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
+; VI-NEXT: .LBB10_29: ; %frem.loop_body99
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mul_f32_e32 v7, v8, v6
+; VI-NEXT: v_rndne_f32_e32 v7, v7
+; VI-NEXT: v_fma_f32 v7, -v7, v4, v8
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT: v_add_f32_e32 v9, v7, v4
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5
+; VI-NEXT: v_ldexp_f32 v7, v7, 11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 11, v5
+; VI-NEXT: s_cbranch_vccnz .LBB10_29
+; VI-NEXT: s_branch .LBB10_31
+; VI-NEXT: .LBB10_30:
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: .LBB10_31: ; %frem.loop_exit100
+; VI-NEXT: v_add_u32_e32 v5, vcc, -10, v5
+; VI-NEXT: v_ldexp_f32 v5, v8, v5
+; VI-NEXT: v_mul_f32_e32 v6, v5, v6
+; VI-NEXT: v_rndne_f32_e32 v6, v6
+; VI-NEXT: v_fma_f32 v5, -v6, v4, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v4, v5, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT: v_ldexp_f32 v3, v4, v3
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7
-; VI-NEXT: v_trunc_f16_e32 v3, v3
-; VI-NEXT: v_fma_f16 v3, -v3, v4, s7
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_and_b32 s0, s12, 0xffff8000
+; VI-NEXT: v_xor_b32_e32 v3, s0, v3
+; VI-NEXT: .LBB10_32: ; %Flow133
+; VI-NEXT: v_mov_b32_e32 v5, 0x1f8
+; VI-NEXT: v_cmp_class_f16_e64 s[2:3], s10, 3
+; VI-NEXT: v_mov_b32_e32 v4, 0x60
+; VI-NEXT: v_cmp_class_f16_e64 s[0:1], s8, v5
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT: v_cmp_class_f16_e64 s[2:3], s6, v4
+; VI-NEXT: v_cmp_class_f16_e64 s[6:7], s6, 3
+; VI-NEXT: v_cmp_class_f16_e64 s[4:5], s4, v5
+; VI-NEXT: s_xor_b64 s[6:7], s[6:7], -1
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s10, v4
+; VI-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; VI-NEXT: v_cmp_class_f16_e64 s[6:7], s11, v4
+; VI-NEXT: v_cmp_class_f16_e64 s[10:11], s11, 3
+; VI-NEXT: v_cmp_class_f16_e64 s[8:9], s9, v5
+; VI-NEXT: s_xor_b64 s[10:11], s[10:11], -1
+; VI-NEXT: s_and_b64 s[8:9], s[10:11], s[8:9]
+; VI-NEXT: v_cmp_class_f16_e64 s[10:11], s14, v4
+; VI-NEXT: v_mov_b32_e32 v4, 0x7e00
+; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; VI-NEXT: v_cmp_class_f16_e64 s[14:15], s14, 3
+; VI-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1]
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_cmp_class_f16_e64 s[12:13], s12, v5
+; VI-NEXT: s_xor_b64 s[14:15], s[14:15], -1
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; VI-NEXT: s_and_b64 s[12:13], s[14:15], s[12:13]
+; VI-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[10:11]
+; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7]
+; VI-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[12:13]
+; VI-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[8:9]
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, s16
+; VI-NEXT: v_mov_b32_e32 v3, s17
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
@@ -791,43 +2998,178 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; CI-NEXT: s_mov_b32 s6, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s4
-; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
-; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
-; CI-NEXT: v_rcp_f32_e32 v3, v1
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT: ; implicit-def: $vgpr0
+; CI-NEXT: s_cbranch_vccz .LBB11_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s6, s2, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s6
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB11_2: ; %Flow56
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_and_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB11_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s4|
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s4|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; CI-NEXT: v_fma_f32 v3, v4, v3, v3
-; CI-NEXT: v_mul_f32_e32 v4, v2, v3
-; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; CI-NEXT: v_fma_f32 v4, v5, v3, v4
-; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
-; CI-NEXT: v_trunc_f32_e32 v1, v1
-; CI-NEXT: v_fma_f32 v0, -v1, v0, s2
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB11_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB11_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT: s_cbranch_vccnz .LBB11_5
+; CI-NEXT: s_branch .LBB11_7
+; CI-NEXT: .LBB11_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB11_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: s_and_b32 s6, s2, 0x80000000
+; CI-NEXT: v_xor_b32_e32 v0, s6, v0
+; CI-NEXT: .LBB11_8: ; %Flow57
; CI-NEXT: v_mov_b32_e32 v1, s5
-; CI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3
-; CI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
-; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s3|, |v1|
+; CI-NEXT: s_mov_b32 s6, 1
+; CI-NEXT: ; implicit-def: $vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB11_10
+; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: s_and_b32 s6, s3, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v2, s5
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
+; CI-NEXT: v_mov_b32_e32 v2, s6
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: .LBB11_10: ; %Flow52
+; CI-NEXT: s_xor_b32 s6, s6, 1
+; CI-NEXT: s_and_b32 s6, s6, 1
+; CI-NEXT: s_cmp_lg_u32 s6, 0
+; CI-NEXT: s_cbranch_scc1 .LBB11_16
+; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: v_frexp_mant_f32_e64 v2, |s5|
+; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1
+; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s5|
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v5, v1, 12
+; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT: v_rcp_f32_e32 v9, v4
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT: v_fma_f32 v4, v5, v4, v4
-; CI-NEXT: v_mul_f32_e32 v5, v3, v4
-; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT: v_fma_f32 v5, v6, v4, v5
-; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT: v_fma_f32 v9, v10, v9, v9
+; CI-NEXT: v_mul_f32_e32 v10, v8, v9
+; CI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT: v_fma_f32 v10, v11, v9, v10
+; CI-NEXT: v_fma_f32 v4, -v4, v10, v8
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB11_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT: .LBB11_13: ; %frem.loop_body23
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: v_mul_f32_e32 v5, v6, v4
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v7, v5, v2
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -12, v3
+; CI-NEXT: v_ldexp_f32_e64 v5, v5, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; CI-NEXT: s_cbranch_vccnz .LBB11_13
+; CI-NEXT: s_branch .LBB11_15
+; CI-NEXT: .LBB11_14:
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT: v_mul_f32_e32 v4, v3, v4
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT: v_add_f32_e32 v2, v3, v2
+; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1
+; CI-NEXT: s_and_b32 s6, s3, 0x80000000
+; CI-NEXT: v_xor_b32_e32 v1, s6, v1
+; CI-NEXT: .LBB11_16: ; %Flow53
+; CI-NEXT: v_mov_b32_e32 v2, 0x60
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s4, v2
+; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v4, 0x1f8
+; CI-NEXT: v_cmp_class_f32_e64 s[6:7], s4, 3
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s2, v4
+; CI-NEXT: s_xor_b64 s[6:7], s[6:7], -1
+; CI-NEXT: s_and_b64 vcc, s[6:7], vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s5, v2
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s3, v4
+; CI-NEXT: v_cmp_class_f32_e64 s[2:3], s5, 3
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v1, -v2, v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -836,42 +3178,177 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; VI-NEXT: s_mov_b32 s6, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
-; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
-; VI-NEXT: v_rcp_f32_e32 v3, v1
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: s_cbranch_vccz .LBB11_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s6, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: .LBB11_2: ; %Flow56
+; VI-NEXT: s_xor_b32 s6, s6, 1
+; VI-NEXT: s_and_b32 s6, s6, 1
+; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cbranch_scc1 .LBB11_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s4|
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v0, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s2|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s4|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 12
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT: v_fma_f32 v3, v4, v3, v3
-; VI-NEXT: v_mul_f32_e32 v4, v2, v3
-; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT: v_fma_f32 v4, v5, v3, v4
-; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
-; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v0, -v1, v0, s2
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB11_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB11_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT: s_cbranch_vccnz .LBB11_5
+; VI-NEXT: s_branch .LBB11_7
+; VI-NEXT: .LBB11_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB11_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: s_and_b32 s6, s2, 0x80000000
+; VI-NEXT: v_xor_b32_e32 v0, s6, v0
+; VI-NEXT: .LBB11_8: ; %Flow57
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, s3
-; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
-; VI-NEXT: v_rcp_f32_e32 v4, v2
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s3|, |v1|
+; VI-NEXT: s_mov_b32 s6, 1
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB11_10
+; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: s_and_b32 s6, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: s_mov_b32 s6, 0
+; VI-NEXT: .LBB11_10: ; %Flow52
+; VI-NEXT: s_xor_b32 s6, s6, 1
+; VI-NEXT: s_and_b32 s6, s6, 1
+; VI-NEXT: s_cmp_lg_u32 s6, 0
+; VI-NEXT: s_cbranch_scc1 .LBB11_16
+; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: v_frexp_mant_f32_e64 v2, |s5|
+; VI-NEXT: v_ldexp_f32 v2, v2, 1
+; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s3|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s3|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s5|
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v5, v1, 12
+; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT: v_rcp_f32_e32 v9, v4
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; VI-NEXT: v_fma_f32 v4, v5, v4, v4
-; VI-NEXT: v_mul_f32_e32 v5, v3, v4
-; VI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; VI-NEXT: v_fma_f32 v5, v6, v4, v5
-; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT: v_fma_f32 v9, v10, v9, v9
+; VI-NEXT: v_mul_f32_e32 v10, v8, v9
+; VI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v4, -v4, v10, v8
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
-; VI-NEXT: v_trunc_f32_e32 v2, v2
-; VI-NEXT: v_fma_f32 v1, -v2, v1, s3
+; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB11_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT: .LBB11_13: ; %frem.loop_body23
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v5, v6, v4
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v7, v5, v2
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -12, v3
+; VI-NEXT: v_ldexp_f32 v5, v5, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; VI-NEXT: s_cbranch_vccnz .LBB11_13
+; VI-NEXT: s_branch .LBB11_15
+; VI-NEXT: .LBB11_14:
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v3, v6, v3
+; VI-NEXT: v_mul_f32_e32 v4, v3, v4
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: v_ldexp_f32 v1, v2, v1
+; VI-NEXT: s_and_b32 s6, s3, 0x80000000
+; VI-NEXT: v_xor_b32_e32 v1, s6, v1
+; VI-NEXT: .LBB11_16: ; %Flow53
+; VI-NEXT: v_mov_b32_e32 v2, 0x60
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; VI-NEXT: v_mov_b32_e32 v4, 0x1f8
+; VI-NEXT: v_cmp_class_f32_e64 s[6:7], s4, 3
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v4
+; VI-NEXT: s_xor_b64 s[6:7], s[6:7], -1
+; VI-NEXT: s_and_b64 vcc, s[6:7], vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s5, v2
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s3, v4
+; VI-NEXT: v_cmp_class_f32_e64 s[2:3], s5, 3
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -892,73 +3369,340 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
+; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s8
-; CI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4
-; CI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4
-; CI-NEXT: v_rcp_f32_e32 v3, v1
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
+; CI-NEXT: ; implicit-def: $vgpr0
+; CI-NEXT: s_cbranch_vccz .LBB12_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: s_and_b32 s2, s4, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v1, s8
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB12_2: ; %Flow136
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_and_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB12_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s8|
+; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
+; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v0, |s4|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s4|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s8|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v5
+; CI-NEXT: v_ldexp_f32_e64 v4, v0, 12
+; CI-NEXT: v_add_i32_e32 v0, vcc, -1, v6
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
+; CI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; CI-NEXT: v_rcp_f32_e32 v8, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; CI-NEXT: v_fma_f32 v3, v4, v3, v3
-; CI-NEXT: v_mul_f32_e32 v4, v2, v3
-; CI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; CI-NEXT: v_fma_f32 v4, v5, v3, v4
-; CI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; CI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; CI-NEXT: v_fma_f32 v8, v9, v8, v8
+; CI-NEXT: v_mul_f32_e32 v9, v7, v8
+; CI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; CI-NEXT: v_fma_f32 v9, v10, v8, v9
+; CI-NEXT: v_fma_f32 v3, -v3, v9, v7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; CI-NEXT: v_div_fixup_f32 v1, v1, v0, s4
-; CI-NEXT: v_trunc_f32_e32 v1, v1
-; CI-NEXT: v_fma_f32 v0, -v1, v0, s4
+; CI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB12_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
+; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; CI-NEXT: .LBB12_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: v_mul_f32_e32 v4, v5, v3
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v6, v4, v1
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, -12, v2
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; CI-NEXT: s_cbranch_vccnz .LBB12_5
+; CI-NEXT: s_branch .LBB12_7
+; CI-NEXT: .LBB12_6:
+; CI-NEXT: v_mov_b32_e32 v5, v4
+; CI-NEXT: .LBB12_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
+; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
+; CI-NEXT: v_mul_f32_e32 v3, v2, v3
+; CI-NEXT: v_rndne_f32_e32 v3, v3
+; CI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; CI-NEXT: v_add_f32_e32 v1, v2, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; CI-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; CI-NEXT: s_and_b32 s2, s4, 0x80000000
+; CI-NEXT: v_xor_b32_e32 v0, s2, v0
+; CI-NEXT: .LBB12_8: ; %Flow137
; CI-NEXT: v_mov_b32_e32 v1, s9
-; CI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5
-; CI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5
-; CI-NEXT: v_rcp_f32_e32 v4, v2
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s5|, |v1|
+; CI-NEXT: s_mov_b32 s2, 1
+; CI-NEXT: ; implicit-def: $vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB12_10
+; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: s_and_b32 s2, s5, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v2, s9
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB12_10: ; %Flow132
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_and_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB12_16
+; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: v_frexp_mant_f32_e64 v2, |s9|
+; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1
+; CI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v1, |s5|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s5|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s9|
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v6
+; CI-NEXT: v_ldexp_f32_e64 v5, v1, 12
+; CI-NEXT: v_add_i32_e32 v1, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v1
+; CI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; CI-NEXT: v_rcp_f32_e32 v9, v4
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; CI-NEXT: v_fma_f32 v4, v5, v4, v4
-; CI-NEXT: v_mul_f32_e32 v5, v3, v4
-; CI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; CI-NEXT: v_fma_f32 v5, v6, v4, v5
-; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; CI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; CI-NEXT: v_fma_f32 v9, v10, v9, v9
+; CI-NEXT: v_mul_f32_e32 v10, v8, v9
+; CI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; CI-NEXT: v_fma_f32 v10, v11, v9, v10
+; CI-NEXT: v_fma_f32 v4, -v4, v10, v8
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT: v_div_fixup_f32 v2, v2, v1, s5
-; CI-NEXT: v_trunc_f32_e32 v2, v2
-; CI-NEXT: v_fma_f32 v1, -v2, v1, s5
+; CI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB12_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6
+; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; CI-NEXT: .LBB12_13: ; %frem.loop_body23
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: v_mul_f32_e32 v5, v6, v4
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v7, v5, v2
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CI-NEXT: v_add_i32_e32 v3, vcc, -12, v3
+; CI-NEXT: v_ldexp_f32_e64 v5, v5, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; CI-NEXT: s_cbranch_vccnz .LBB12_13
+; CI-NEXT: s_branch .LBB12_15
+; CI-NEXT: .LBB12_14:
+; CI-NEXT: v_mov_b32_e32 v6, v5
+; CI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
+; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
+; CI-NEXT: v_mul_f32_e32 v4, v3, v4
+; CI-NEXT: v_rndne_f32_e32 v4, v4
+; CI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; CI-NEXT: v_add_f32_e32 v2, v3, v2
+; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CI-NEXT: v_ldexp_f32_e32 v1, v2, v1
+; CI-NEXT: s_and_b32 s2, s5, 0x80000000
+; CI-NEXT: v_xor_b32_e32 v1, s2, v1
+; CI-NEXT: .LBB12_16: ; %Flow133
; CI-NEXT: v_mov_b32_e32 v2, s10
-; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6
-; CI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6
-; CI-NEXT: v_rcp_f32_e32 v5, v3
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s6|, |v2|
+; CI-NEXT: s_mov_b32 s2, 1
+; CI-NEXT: ; implicit-def: $vgpr2
+; CI-NEXT: s_cbranch_vccz .LBB12_18
+; CI-NEXT: ; %bb.17: ; %frem.else50
+; CI-NEXT: s_and_b32 s2, s6, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v3, s10
+; CI-NEXT: v_mov_b32_e32 v2, s6
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB12_18: ; %Flow128
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_and_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB12_24
+; CI-NEXT: ; %bb.19: ; %frem.compute49
+; CI-NEXT: v_frexp_mant_f32_e64 v3, |s10|
+; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1
+; CI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v2, |s6|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s6|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s10|
+; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v7
+; CI-NEXT: v_ldexp_f32_e64 v6, v2, 12
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v8
+; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
+; CI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; CI-NEXT: v_rcp_f32_e32 v10, v5
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
-; CI-NEXT: v_fma_f32 v5, v6, v5, v5
-; CI-NEXT: v_mul_f32_e32 v6, v4, v5
-; CI-NEXT: v_fma_f32 v7, -v3, v6, v4
-; CI-NEXT: v_fma_f32 v6, v7, v5, v6
-; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
+; CI-NEXT: v_fma_f32 v11, -v5, v10, 1.0
+; CI-NEXT: v_fma_f32 v10, v11, v10, v10
+; CI-NEXT: v_mul_f32_e32 v11, v9, v10
+; CI-NEXT: v_fma_f32 v12, -v5, v11, v9
+; CI-NEXT: v_fma_f32 v11, v12, v10, v11
+; CI-NEXT: v_fma_f32 v5, -v5, v11, v9
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
-; CI-NEXT: v_div_fixup_f32 v3, v3, v2, s6
-; CI-NEXT: v_trunc_f32_e32 v3, v3
-; CI-NEXT: v_fma_f32 v2, -v3, v2, s6
+; CI-NEXT: v_div_fmas_f32 v5, v5, v10, v11
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4
+; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB12_22
+; CI-NEXT: ; %bb.20: ; %frem.loop_body57.preheader
+; CI-NEXT: v_add_i32_e32 v4, vcc, 12, v7
+; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
+; CI-NEXT: .LBB12_21: ; %frem.loop_body57
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v6
+; CI-NEXT: v_mul_f32_e32 v6, v7, v5
+; CI-NEXT: v_rndne_f32_e32 v6, v6
+; CI-NEXT: v_fma_f32 v6, -v6, v3, v7
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6
+; CI-NEXT: v_add_f32_e32 v8, v6, v3
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; CI-NEXT: v_add_i32_e32 v4, vcc, -12, v4
+; CI-NEXT: v_ldexp_f32_e64 v6, v6, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4
+; CI-NEXT: s_cbranch_vccnz .LBB12_21
+; CI-NEXT: s_branch .LBB12_23
+; CI-NEXT: .LBB12_22:
+; CI-NEXT: v_mov_b32_e32 v7, v6
+; CI-NEXT: .LBB12_23: ; %frem.loop_exit58
+; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4
+; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4
+; CI-NEXT: v_mul_f32_e32 v5, v4, v5
+; CI-NEXT: v_rndne_f32_e32 v5, v5
+; CI-NEXT: v_fma_f32 v4, -v5, v3, v4
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; CI-NEXT: v_add_f32_e32 v3, v4, v3
+; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; CI-NEXT: v_ldexp_f32_e32 v2, v3, v2
+; CI-NEXT: s_and_b32 s2, s6, 0x80000000
+; CI-NEXT: v_xor_b32_e32 v2, s2, v2
+; CI-NEXT: .LBB12_24: ; %Flow129
; CI-NEXT: v_mov_b32_e32 v3, s11
-; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7
-; CI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7
-; CI-NEXT: v_rcp_f32_e32 v6, v4
+; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s7|, |v3|
+; CI-NEXT: s_mov_b32 s2, 1
+; CI-NEXT: ; implicit-def: $vgpr3
+; CI-NEXT: s_cbranch_vccz .LBB12_26
+; CI-NEXT: ; %bb.25: ; %frem.else84
+; CI-NEXT: s_and_b32 s2, s7, 0x80000000
+; CI-NEXT: v_mov_b32_e32 v4, s11
+; CI-NEXT: v_mov_b32_e32 v3, s7
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
+; CI-NEXT: v_mov_b32_e32 v4, s2
+; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB12_26: ; %Flow124
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_and_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB12_32
+; CI-NEXT: ; %bb.27: ; %frem.compute83
+; CI-NEXT: v_frexp_mant_f32_e64 v4, |s11|
+; CI-NEXT: v_ldexp_f32_e64 v4, v4, 1
+; CI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
+; CI-NEXT: v_frexp_mant_f32_e64 v3, |s7|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s7|
+; CI-NEXT: v_frexp_exp_i32_f32_e64 v9, |s11|
+; CI-NEXT: v_add_i32_e32 v5, vcc, -1, v8
+; CI-NEXT: v_ldexp_f32_e64 v7, v3, 12
+; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v9
+; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v3
+; CI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; CI-NEXT: v_rcp_f32_e32 v11, v6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
-; CI-NEXT: v_fma_f32 v6, v7, v6, v6
-; CI-NEXT: v_mul_f32_e32 v7, v5, v6
-; CI-NEXT: v_fma_f32 v8, -v4, v7, v5
-; CI-NEXT: v_fma_f32 v7, v8, v6, v7
-; CI-NEXT: v_fma_f32 v4, -v4, v7, v5
+; CI-NEXT: v_fma_f32 v12, -v6, v11, 1.0
+; CI-NEXT: v_fma_f32 v11, v12, v11, v11
+; CI-NEXT: v_mul_f32_e32 v12, v10, v11
+; CI-NEXT: v_fma_f32 v13, -v6, v12, v10
+; CI-NEXT: v_fma_f32 v12, v13, v11, v12
+; CI-NEXT: v_fma_f32 v6, -v6, v12, v10
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
+; CI-NEXT: v_div_fmas_f32 v6, v6, v11, v12
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5
+; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB12_30
+; CI-NEXT: ; %bb.28: ; %frem.loop_body91.preheader
+; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v8
+; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
+; CI-NEXT: .LBB12_29: ; %frem.loop_body91
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v8, v7
+; CI-NEXT: v_mul_f32_e32 v7, v8, v6
+; CI-NEXT: v_rndne_f32_e32 v7, v7
+; CI-NEXT: v_fma_f32 v7, -v7, v4, v8
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7
+; CI-NEXT: v_add_f32_e32 v9, v7, v4
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; CI-NEXT: v_add_i32_e32 v5, vcc, -12, v5
+; CI-NEXT: v_ldexp_f32_e64 v7, v7, 12
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v5
+; CI-NEXT: s_cbranch_vccnz .LBB12_29
+; CI-NEXT: s_branch .LBB12_31
+; CI-NEXT: .LBB12_30:
+; CI-NEXT: v_mov_b32_e32 v8, v7
+; CI-NEXT: .LBB12_31: ; %frem.loop_exit92
+; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5
+; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5
+; CI-NEXT: v_mul_f32_e32 v6, v5, v6
+; CI-NEXT: v_rndne_f32_e32 v6, v6
+; CI-NEXT: v_fma_f32 v5, -v6, v4, v5
+; CI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; CI-NEXT: v_add_f32_e32 v4, v5, v4
+; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; CI-NEXT: v_ldexp_f32_e32 v3, v4, v3
+; CI-NEXT: s_and_b32 s2, s7, 0x80000000
+; CI-NEXT: v_xor_b32_e32 v3, s2, v3
+; CI-NEXT: .LBB12_32: ; %Flow125
+; CI-NEXT: v_mov_b32_e32 v4, 0x60
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s8, v4
+; CI-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v6, 0x1f8
+; CI-NEXT: v_cmp_class_f32_e64 s[2:3], s8, 3
+; CI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s4, v6
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s9, v4
+; CI-NEXT: v_cmp_class_f32_e64 s[2:3], s9, 3
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s5, v6
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s10, v4
+; CI-NEXT: v_cmp_class_f32_e64 s[2:3], s10, 3
+; CI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s6, v6
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s11, v4
+; CI-NEXT: v_cmp_class_f32_e64 s[2:3], s11, 3
+; CI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; CI-NEXT: v_cmp_class_f32_e32 vcc, s7, v6
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_div_fixup_f32 v4, v4, v3, s7
-; CI-NEXT: v_trunc_f32_e32 v4, v4
-; CI-NEXT: v_fma_f32 v3, -v4, v3, s7
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
@@ -969,71 +3713,338 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40
+; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4
-; VI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4
-; VI-NEXT: v_rcp_f32_e32 v3, v1
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
+; VI-NEXT: ; implicit-def: $vgpr0
+; VI-NEXT: s_cbranch_vccz .LBB12_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: s_and_b32 s2, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s8
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB12_2: ; %Flow136
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_and_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB12_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s8|
+; VI-NEXT: v_ldexp_f32 v1, v1, 1
+; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v0, |s4|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v5, |s4|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s8|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v5
+; VI-NEXT: v_ldexp_f32 v4, v0, 12
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v6
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v0
+; VI-NEXT: v_div_scale_f32 v7, vcc, 1.0, v1, 1.0
+; VI-NEXT: v_rcp_f32_e32 v8, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT: v_fma_f32 v3, v4, v3, v3
-; VI-NEXT: v_mul_f32_e32 v4, v2, v3
-; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT: v_fma_f32 v4, v5, v3, v4
-; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
+; VI-NEXT: v_fma_f32 v9, -v3, v8, 1.0
+; VI-NEXT: v_fma_f32 v8, v9, v8, v8
+; VI-NEXT: v_mul_f32_e32 v9, v7, v8
+; VI-NEXT: v_fma_f32 v10, -v3, v9, v7
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v3, -v3, v9, v7
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s4
-; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v0, -v1, v0, s4
+; VI-NEXT: v_div_fmas_f32 v3, v3, v8, v9
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
+; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB12_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
+; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; VI-NEXT: .LBB12_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: v_mul_f32_e32 v4, v5, v3
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v4, -v4, v1, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v6, v4, v1
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, -12, v2
+; VI-NEXT: v_ldexp_f32 v4, v4, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v2
+; VI-NEXT: s_cbranch_vccnz .LBB12_5
+; VI-NEXT: s_branch .LBB12_7
+; VI-NEXT: .LBB12_6:
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: .LBB12_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
+; VI-NEXT: v_ldexp_f32 v2, v5, v2
+; VI-NEXT: v_mul_f32_e32 v3, v2, v3
+; VI-NEXT: v_rndne_f32_e32 v3, v3
+; VI-NEXT: v_fma_f32 v2, -v3, v1, v2
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v2
+; VI-NEXT: v_add_f32_e32 v1, v2, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; VI-NEXT: v_ldexp_f32 v0, v1, v0
+; VI-NEXT: s_and_b32 s2, s4, 0x80000000
+; VI-NEXT: v_xor_b32_e32 v0, s2, v0
+; VI-NEXT: .LBB12_8: ; %Flow137
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5
-; VI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5
-; VI-NEXT: v_rcp_f32_e32 v4, v2
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s5|, |v1|
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB12_10
+; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: s_and_b32 s2, s5, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s9
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB12_10: ; %Flow132
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_and_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB12_16
+; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: v_frexp_mant_f32_e64 v2, |s9|
+; VI-NEXT: v_ldexp_f32 v2, v2, 1
+; VI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v1, |s5|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v6, |s5|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s9|
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v6
+; VI-NEXT: v_ldexp_f32 v5, v1, 12
+; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_div_scale_f32 v8, vcc, 1.0, v2, 1.0
+; VI-NEXT: v_rcp_f32_e32 v9, v4
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
-; VI-NEXT: v_fma_f32 v4, v5, v4, v4
-; VI-NEXT: v_mul_f32_e32 v5, v3, v4
-; VI-NEXT: v_fma_f32 v6, -v2, v5, v3
-; VI-NEXT: v_fma_f32 v5, v6, v4, v5
-; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
+; VI-NEXT: v_fma_f32 v10, -v4, v9, 1.0
+; VI-NEXT: v_fma_f32 v9, v10, v9, v9
+; VI-NEXT: v_mul_f32_e32 v10, v8, v9
+; VI-NEXT: v_fma_f32 v11, -v4, v10, v8
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v4, -v4, v10, v8
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s5
-; VI-NEXT: v_trunc_f32_e32 v2, v2
-; VI-NEXT: v_fma_f32 v1, -v2, v1, s5
+; VI-NEXT: v_div_fmas_f32 v4, v4, v9, v10
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
+; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB12_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
+; VI-NEXT: .LBB12_13: ; %frem.loop_body23
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v5, v6, v4
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v5, -v5, v2, v6
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v7, v5, v2
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, -12, v3
+; VI-NEXT: v_ldexp_f32 v5, v5, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v3
+; VI-NEXT: s_cbranch_vccnz .LBB12_13
+; VI-NEXT: s_branch .LBB12_15
+; VI-NEXT: .LBB12_14:
+; VI-NEXT: v_mov_b32_e32 v6, v5
+; VI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
+; VI-NEXT: v_ldexp_f32 v3, v6, v3
+; VI-NEXT: v_mul_f32_e32 v4, v3, v4
+; VI-NEXT: v_rndne_f32_e32 v4, v4
+; VI-NEXT: v_fma_f32 v3, -v4, v2, v3
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; VI-NEXT: v_add_f32_e32 v2, v3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: v_ldexp_f32 v1, v2, v1
+; VI-NEXT: s_and_b32 s2, s5, 0x80000000
+; VI-NEXT: v_xor_b32_e32 v1, s2, v1
+; VI-NEXT: .LBB12_16: ; %Flow133
; VI-NEXT: v_mov_b32_e32 v2, s10
-; VI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6
-; VI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6
-; VI-NEXT: v_rcp_f32_e32 v5, v3
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s6|, |v2|
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: ; implicit-def: $vgpr2
+; VI-NEXT: s_cbranch_vccz .LBB12_18
+; VI-NEXT: ; %bb.17: ; %frem.else50
+; VI-NEXT: s_and_b32 s2, s6, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v3, s10
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB12_18: ; %Flow128
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_and_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB12_24
+; VI-NEXT: ; %bb.19: ; %frem.compute49
+; VI-NEXT: v_frexp_mant_f32_e64 v3, |s10|
+; VI-NEXT: v_ldexp_f32 v3, v3, 1
+; VI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v2, |s6|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v7, |s6|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s10|
+; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v7
+; VI-NEXT: v_ldexp_f32 v6, v2, 12
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v8
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v2
+; VI-NEXT: v_div_scale_f32 v9, vcc, 1.0, v3, 1.0
+; VI-NEXT: v_rcp_f32_e32 v10, v5
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v6, -v3, v5, 1.0
-; VI-NEXT: v_fma_f32 v5, v6, v5, v5
-; VI-NEXT: v_mul_f32_e32 v6, v4, v5
-; VI-NEXT: v_fma_f32 v7, -v3, v6, v4
-; VI-NEXT: v_fma_f32 v6, v7, v5, v6
-; VI-NEXT: v_fma_f32 v3, -v3, v6, v4
+; VI-NEXT: v_fma_f32 v11, -v5, v10, 1.0
+; VI-NEXT: v_fma_f32 v10, v11, v10, v10
+; VI-NEXT: v_mul_f32_e32 v11, v9, v10
+; VI-NEXT: v_fma_f32 v12, -v5, v11, v9
+; VI-NEXT: v_fma_f32 v11, v12, v10, v11
+; VI-NEXT: v_fma_f32 v5, -v5, v11, v9
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
-; VI-NEXT: v_div_fixup_f32 v3, v3, v2, s6
-; VI-NEXT: v_trunc_f32_e32 v3, v3
-; VI-NEXT: v_fma_f32 v2, -v3, v2, s6
+; VI-NEXT: v_div_fmas_f32 v5, v5, v10, v11
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4
+; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB12_22
+; VI-NEXT: ; %bb.20: ; %frem.loop_body57.preheader
+; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v7
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
+; VI-NEXT: .LBB12_21: ; %frem.loop_body57
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_mul_f32_e32 v6, v7, v5
+; VI-NEXT: v_rndne_f32_e32 v6, v6
+; VI-NEXT: v_fma_f32 v6, -v6, v3, v7
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v6
+; VI-NEXT: v_add_f32_e32 v8, v6, v3
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, -12, v4
+; VI-NEXT: v_ldexp_f32 v6, v6, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4
+; VI-NEXT: s_cbranch_vccnz .LBB12_21
+; VI-NEXT: s_branch .LBB12_23
+; VI-NEXT: .LBB12_22:
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: .LBB12_23: ; %frem.loop_exit58
+; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4
+; VI-NEXT: v_ldexp_f32 v4, v7, v4
+; VI-NEXT: v_mul_f32_e32 v5, v4, v5
+; VI-NEXT: v_rndne_f32_e32 v5, v5
+; VI-NEXT: v_fma_f32 v4, -v5, v3, v4
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v4
+; VI-NEXT: v_add_f32_e32 v3, v4, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; VI-NEXT: v_ldexp_f32 v2, v3, v2
+; VI-NEXT: s_and_b32 s2, s6, 0x80000000
+; VI-NEXT: v_xor_b32_e32 v2, s2, v2
+; VI-NEXT: .LBB12_24: ; %Flow129
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7
-; VI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7
-; VI-NEXT: v_rcp_f32_e32 v6, v4
+; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s7|, |v3|
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: ; implicit-def: $vgpr3
+; VI-NEXT: s_cbranch_vccz .LBB12_26
+; VI-NEXT: ; %bb.25: ; %frem.else84
+; VI-NEXT: s_and_b32 s2, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v4, s11
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB12_26: ; %Flow124
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_and_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB12_32
+; VI-NEXT: ; %bb.27: ; %frem.compute83
+; VI-NEXT: v_frexp_mant_f32_e64 v4, |s11|
+; VI-NEXT: v_ldexp_f32 v4, v4, 1
+; VI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
+; VI-NEXT: v_frexp_mant_f32_e64 v3, |s7|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v8, |s7|
+; VI-NEXT: v_frexp_exp_i32_f32_e64 v9, |s11|
+; VI-NEXT: v_add_u32_e32 v5, vcc, -1, v8
+; VI-NEXT: v_ldexp_f32 v7, v3, 12
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v9
+; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v3
+; VI-NEXT: v_div_scale_f32 v10, vcc, 1.0, v4, 1.0
+; VI-NEXT: v_rcp_f32_e32 v11, v6
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; VI-NEXT: v_fma_f32 v7, -v4, v6, 1.0
-; VI-NEXT: v_fma_f32 v6, v7, v6, v6
-; VI-NEXT: v_mul_f32_e32 v7, v5, v6
-; VI-NEXT: v_fma_f32 v8, -v4, v7, v5
-; VI-NEXT: v_fma_f32 v7, v8, v6, v7
-; VI-NEXT: v_fma_f32 v4, -v4, v7, v5
+; VI-NEXT: v_fma_f32 v12, -v6, v11, 1.0
+; VI-NEXT: v_fma_f32 v11, v12, v11, v11
+; VI-NEXT: v_mul_f32_e32 v12, v10, v11
+; VI-NEXT: v_fma_f32 v13, -v6, v12, v10
+; VI-NEXT: v_fma_f32 v12, v13, v11, v12
+; VI-NEXT: v_fma_f32 v6, -v6, v12, v10
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
-; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s7
-; VI-NEXT: v_trunc_f32_e32 v4, v4
-; VI-NEXT: v_fma_f32 v3, -v4, v3, s7
+; VI-NEXT: v_div_fmas_f32 v6, v6, v11, v12
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5
+; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB12_30
+; VI-NEXT: ; %bb.28: ; %frem.loop_body91.preheader
+; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v8
+; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
+; VI-NEXT: .LBB12_29: ; %frem.loop_body91
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: v_mul_f32_e32 v7, v8, v6
+; VI-NEXT: v_rndne_f32_e32 v7, v7
+; VI-NEXT: v_fma_f32 v7, -v7, v4, v8
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v7
+; VI-NEXT: v_add_f32_e32 v9, v7, v4
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; VI-NEXT: v_add_u32_e32 v5, vcc, -12, v5
+; VI-NEXT: v_ldexp_f32 v7, v7, 12
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 12, v5
+; VI-NEXT: s_cbranch_vccnz .LBB12_29
+; VI-NEXT: s_branch .LBB12_31
+; VI-NEXT: .LBB12_30:
+; VI-NEXT: v_mov_b32_e32 v8, v7
+; VI-NEXT: .LBB12_31: ; %frem.loop_exit92
+; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5
+; VI-NEXT: v_ldexp_f32 v5, v8, v5
+; VI-NEXT: v_mul_f32_e32 v6, v5, v6
+; VI-NEXT: v_rndne_f32_e32 v6, v6
+; VI-NEXT: v_fma_f32 v5, -v6, v4, v5
+; VI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; VI-NEXT: v_add_f32_e32 v4, v5, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT: v_ldexp_f32 v3, v4, v3
+; VI-NEXT: s_and_b32 s2, s7, 0x80000000
+; VI-NEXT: v_xor_b32_e32 v3, s2, v3
+; VI-NEXT: .LBB12_32: ; %Flow125
+; VI-NEXT: v_mov_b32_e32 v4, 0x60
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s8, v4
+; VI-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; VI-NEXT: v_mov_b32_e32 v6, 0x1f8
+; VI-NEXT: v_cmp_class_f32_e64 s[2:3], s8, 3
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v6
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s9, v4
+; VI-NEXT: v_cmp_class_f32_e64 s[2:3], s9, 3
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s5, v6
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s10, v4
+; VI-NEXT: v_cmp_class_f32_e64 s[2:3], s10, 3
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s6, v6
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s11, v4
+; VI-NEXT: v_cmp_class_f32_e64 s[2:3], s11, 3
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s7, v6
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -1054,39 +4065,204 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10
+; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: v_mov_b32_e32 v1, s9
-; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5]
-; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5]
-; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5]
-; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5]
+; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; CI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CI-NEXT: s_cbranch_vccz .LBB13_2
+; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: v_mov_b32_e32 v0, s8
+; CI-NEXT: v_mov_b32_e32 v1, s9
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: s_brev_b32 s3, 1
+; CI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v2, s4
+; CI-NEXT: v_mov_b32_e32 v3, s5
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB13_2: ; %Flow56
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_and_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB13_8
+; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
+; CI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[8:9]|
+; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v6
+; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v7
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v2, v8
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0
+; CI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; CI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; CI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB13_6
+; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6
+; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7
+; CI-NEXT: .LBB13_5: ; %frem.loop_body
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; CI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; CI-NEXT: v_add_i32_e32 v9, vcc, 0xffffffe6, v9
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; CI-NEXT: s_cbranch_vccnz .LBB13_5
+; CI-NEXT: s_branch .LBB13_7
+; CI-NEXT: .LBB13_6:
+; CI-NEXT: v_mov_b32_e32 v7, v5
+; CI-NEXT: v_mov_b32_e32 v6, v4
+; CI-NEXT: .LBB13_7: ; %frem.loop_exit
+; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9
+; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: s_brev_b32 s3, 1
+; CI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; CI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; CI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; CI-NEXT: v_xor_b32_e32 v0, s2, v0
+; CI-NEXT: v_xor_b32_e32 v1, s3, v1
+; CI-NEXT: .LBB13_8: ; %Flow57
; CI-NEXT: v_mov_b32_e32 v2, s10
; CI-NEXT: v_mov_b32_e32 v3, s11
-; CI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7]
-; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7]
+; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; CI-NEXT: s_mov_b32 s2, 1
+; CI-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CI-NEXT: s_cbranch_vccz .LBB13_10
+; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: v_mov_b32_e32 v2, s10
+; CI-NEXT: v_mov_b32_e32 v3, s11
+; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: s_brev_b32 s3, 1
+; CI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_mov_b32_e32 v4, s6
+; CI-NEXT: v_mov_b32_e32 v5, s7
+; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: .LBB13_10: ; %Flow52
+; CI-NEXT: s_xor_b32 s2, s2, 1
+; CI-NEXT: s_and_b32 s2, s2, 1
+; CI-NEXT: s_cmp_lg_u32 s2, 0
+; CI-NEXT: s_cbranch_scc1 .LBB13_16
+; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
+; CI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
+; CI-NEXT: v_ldexp_f64 v[6:7], v[2:3], 26
+; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[10:11]|
+; CI-NEXT: v_add_i32_e32 v4, vcc, -1, v8
+; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v9
+; CI-NEXT: v_sub_i32_e32 v11, vcc, v4, v10
+; CI-NEXT: v_ldexp_f64 v[2:3], v[2:3], 1
+; CI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0
+; CI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0
+; CI-NEXT: v_rcp_f64_e32 v[12:13], v[4:5]
+; CI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; CI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; CI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; CI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; CI-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15]
+; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11
+; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
+; CI-NEXT: s_cbranch_vccnz .LBB13_14
+; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: v_add_i32_e32 v8, vcc, 26, v8
+; CI-NEXT: v_sub_i32_e32 v11, vcc, v8, v9
+; CI-NEXT: .LBB13_13: ; %frem.loop_body23
+; CI-NEXT: ; =>This Inner Loop Header: Depth=1
+; CI-NEXT: v_mov_b32_e32 v9, v7
+; CI-NEXT: v_mov_b32_e32 v8, v6
+; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; CI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7]
+; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; CI-NEXT: v_add_f64 v[12:13], v[6:7], v[2:3]
+; CI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
+; CI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
+; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; CI-NEXT: v_add_i32_e32 v11, vcc, 0xffffffe6, v11
+; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v11
+; CI-NEXT: s_cbranch_vccnz .LBB13_13
+; CI-NEXT: s_branch .LBB13_15
+; CI-NEXT: .LBB13_14:
+; CI-NEXT: v_mov_b32_e32 v9, v7
+; CI-NEXT: v_mov_b32_e32 v8, v6
+; CI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; CI-NEXT: v_add_i32_e32 v6, vcc, 0xffffffe7, v11
+; CI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6
+; CI-NEXT: s_mov_b32 s2, 0
+; CI-NEXT: s_brev_b32 s3, 1
+; CI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; CI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5]
+; CI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7]
+; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CI-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10
+; CI-NEXT: v_xor_b32_e32 v2, s2, v2
+; CI-NEXT: v_xor_b32_e32 v3, s3, v3
+; CI-NEXT: .LBB13_16: ; %Flow53
+; CI-NEXT: v_mov_b32_e32 v4, 0x60
+; CI-NEXT: v_cmp_class_f64_e32 vcc, s[8:9], v4
+; CI-NEXT: v_mov_b32_e32 v5, 0x7ff80000
+; CI-NEXT: v_mov_b32_e32 v6, 0x1f8
+; CI-NEXT: v_cmp_class_f64_e64 s[2:3], s[8:9], 3
+; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v6
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CI-NEXT: v_cmp_class_f64_e32 vcc, s[10:11], v4
+; CI-NEXT: v_cmp_class_f64_e64 s[2:3], s[10:11], 3
+; CI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; CI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; CI-NEXT: v_cmp_class_f64_e32 vcc, s[6:7], v6
+; CI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; CI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
-; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; CI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7]
-; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
-; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7]
-; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7]
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
@@ -1097,38 +4273,203 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40
+; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5]
-; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5]
-; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5]
-; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5]
+; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: s_cbranch_vccz .LBB13_2
+; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_brev_b32 s3, 1
+; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB13_2: ; %Flow56
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_and_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB13_8
+; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
+; VI-NEXT: v_ldexp_f64 v[4:5], v[0:1], 26
+; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[8:9]|
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v6
+; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v7
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v2, v8
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], 1
+; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0
+; VI-NEXT: v_div_scale_f64 v[14:15], vcc, 1.0, v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[10:11], v[2:3]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_fma_f64 v[12:13], -v[2:3], v[10:11], 1.0
+; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; VI-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[12:13], v[14:15]
+; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[10:11], v[12:13]
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
+; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB13_6
+; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6
+; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7
+; VI-NEXT: .LBB13_5: ; %frem.loop_body
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[0:1], v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[10:11], v[4:5], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0xffffffe6, v9
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v9
+; VI-NEXT: s_cbranch_vccnz .LBB13_5
+; VI-NEXT: s_branch .LBB13_7
+; VI-NEXT: .LBB13_6:
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: .LBB13_7: ; %frem.loop_exit
+; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9
+; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_brev_b32 s3, 1
+; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT: v_mul_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_rndne_f64_e32 v[2:3], v[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[0:1], v[4:5]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: v_ldexp_f64 v[0:1], v[0:1], v8
+; VI-NEXT: v_xor_b32_e32 v0, s2, v0
+; VI-NEXT: v_xor_b32_e32 v1, s3, v1
+; VI-NEXT: .LBB13_8: ; %Flow57
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7]
-; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7]
-; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
-; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
-; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
-; VI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7]
-; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
-; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7]
-; VI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7]
+; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; VI-NEXT: s_mov_b32 s2, 1
+; VI-NEXT: ; implicit-def: $vgpr2_vgpr3
+; VI-NEXT: s_cbranch_vccz .LBB13_10
+; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_brev_b32 s3, 1
+; VI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: .LBB13_10: ; %Flow52
+; VI-NEXT: s_xor_b32 s2, s2, 1
+; VI-NEXT: s_and_b32 s2, s2, 1
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cbranch_scc1 .LBB13_16
+; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
+; VI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
+; VI-NEXT: v_ldexp_f64 v[6:7], v[2:3], 26
+; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[10:11]|
+; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v8
+; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v9
+; VI-NEXT: v_sub_u32_e32 v11, vcc, v4, v10
+; VI-NEXT: v_ldexp_f64 v[2:3], v[2:3], 1
+; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], 1.0
+; VI-NEXT: v_div_scale_f64 v[16:17], vcc, 1.0, v[2:3], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[12:13], v[4:5]
+; VI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; VI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[12:13], 1.0
+; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
+; VI-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17]
+; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[12:13], v[14:15]
+; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11
+; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
+; VI-NEXT: s_cbranch_vccnz .LBB13_14
+; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: v_add_u32_e32 v8, vcc, 26, v8
+; VI-NEXT: v_sub_u32_e32 v11, vcc, v8, v9
+; VI-NEXT: .LBB13_13: ; %frem.loop_body23
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
+; VI-NEXT: v_rndne_f64_e32 v[6:7], v[6:7]
+; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[2:3], v[8:9]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7]
+; VI-NEXT: v_add_f64 v[12:13], v[6:7], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
+; VI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
+; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26
+; VI-NEXT: v_add_u32_e32 v11, vcc, 0xffffffe6, v11
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v11
+; VI-NEXT: s_cbranch_vccnz .LBB13_13
+; VI-NEXT: s_branch .LBB13_15
+; VI-NEXT: .LBB13_14:
+; VI-NEXT: v_mov_b32_e32 v9, v7
+; VI-NEXT: v_mov_b32_e32 v8, v6
+; VI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; VI-NEXT: v_add_u32_e32 v6, vcc, 0xffffffe7, v11
+; VI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6
+; VI-NEXT: s_mov_b32 s2, 0
+; VI-NEXT: s_brev_b32 s3, 1
+; VI-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; VI-NEXT: v_mul_f64 v[4:5], v[6:7], v[4:5]
+; VI-NEXT: v_rndne_f64_e32 v[4:5], v[4:5]
+; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[2:3], v[6:7]
+; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5]
+; VI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-NEXT: v_ldexp_f64 v[2:3], v[2:3], v10
+; VI-NEXT: v_xor_b32_e32 v2, s2, v2
+; VI-NEXT: v_xor_b32_e32 v3, s3, v3
+; VI-NEXT: .LBB13_16: ; %Flow53
+; VI-NEXT: v_mov_b32_e32 v4, 0x60
+; VI-NEXT: v_cmp_class_f64_e32 vcc, s[8:9], v4
+; VI-NEXT: v_mov_b32_e32 v5, 0x7ff80000
+; VI-NEXT: v_mov_b32_e32 v6, 0x1f8
+; VI-NEXT: v_cmp_class_f64_e64 s[2:3], s[8:9], 3
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; VI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v6
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-NEXT: v_cmp_class_f64_e32 vcc, s[10:11], v4
+; VI-NEXT: v_cmp_class_f64_e64 s[2:3], s[10:11], 3
+; VI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; VI-NEXT: v_cmp_class_f64_e32 vcc, s[6:7], v6
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
+; VI-NEXT: s_and_b64 vcc, s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 4962254545c3c..0d1ed8068b9b5 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -2570,21 +2570,86 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28
; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0
-; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0
+; GFX1032-NEXT: v_cmp_ngt_f32_e64 s1, v0, |s0|
+; GFX1032-NEXT: s_and_saveexec_b32 s2, s1
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s2
+; GFX1032-NEXT: ; %bb.1: ; %frem.else
+; GFX1032-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1032-NEXT: v_cmp_eq_f32_e64 vcc_lo, v0, |s0|
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX1032-NEXT: ; %bb.2: ; %Flow13
+; GFX1032-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032-NEXT: s_cbranch_execz .LBB51_8
+; GFX1032-NEXT: ; %bb.3: ; %frem.compute
+; GFX1032-NEXT: v_frexp_mant_f32_e64 v1, |s0|
+; GFX1032-NEXT: v_frexp_exp_i32_f32_e32 v7, v0
+; GFX1032-NEXT: v_frexp_mant_f32_e32 v8, v0
+; GFX1032-NEXT: v_ldexp_f32 v1, v1, 1
+; GFX1032-NEXT: v_div_scale_f32 v2, s2, v1, v1, 1.0
+; GFX1032-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0
+; GFX1032-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1032-NEXT: v_fma_f32 v4, -v2, v3, 1.0
+; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v3
+; GFX1032-NEXT: v_mul_f32_e32 v4, v5, v3
+; GFX1032-NEXT: v_fma_f32 v6, -v2, v4, v5
+; GFX1032-NEXT: v_fmac_f32_e32 v4, v6, v3
+; GFX1032-NEXT: v_frexp_exp_i32_f32_e32 v6, s0
+; GFX1032-NEXT: v_fma_f32 v5, -v2, v4, v5
+; GFX1032-NEXT: v_add_nc_u32_e32 v2, -1, v6
+; GFX1032-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX1032-NEXT: v_xad_u32 v4, v2, -1, v7
+; GFX1032-NEXT: v_ldexp_f32 v5, v8, 12
+; GFX1032-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 12, v4
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB51_7
+; GFX1032-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1032-NEXT: v_sub_nc_u32_e32 v4, v7, v6
+; GFX1032-NEXT: s_mov_b32 s3, 0
+; GFX1032-NEXT: v_add_nc_u32_e32 v4, 12, v4
+; GFX1032-NEXT: .LBB51_5: ; %frem.loop_body
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_mov_b32_e32 v6, v5
+; GFX1032-NEXT: v_add_nc_u32_e32 v4, -12, v4
+; GFX1032-NEXT: v_mul_f32_e32 v5, v6, v3
+; GFX1032-NEXT: v_rndne_f32_e32 v5, v5
+; GFX1032-NEXT: v_fma_f32 v5, -v5, v1, v6
+; GFX1032-NEXT: v_add_f32_e32 v7, v5, v1
+; GFX1032-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1032-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
+; GFX1032-NEXT: v_ldexp_f32 v5, v5, 12
+; GFX1032-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_cbranch_execnz .LBB51_5
+; GFX1032-NEXT: ; %bb.6: ; %Flow
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: v_mov_b32_e32 v5, v6
+; GFX1032-NEXT: .LBB51_7: ; %Flow12
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: v_add_nc_u32_e32 v4, -11, v4
+; GFX1032-NEXT: v_ldexp_f32 v4, v5, v4
+; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX1032-NEXT: v_rndne_f32_e32 v3, v3
+; GFX1032-NEXT: v_fma_f32 v3, -v3, v1, v4
+; GFX1032-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX1032-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX1032-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX1032-NEXT: v_and_b32_e32 v2, 0x80000000, v0
+; GFX1032-NEXT: v_xor_b32_e32 v1, v2, v1
+; GFX1032-NEXT: .LBB51_8: ; %Flow14
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: v_cmp_class_f32_e64 s1, s0, 3
+; GFX1032-NEXT: v_cmp_class_f32_e64 s0, s0, 0x60
+; GFX1032-NEXT: v_cmp_class_f32_e64 s2, v0, 0x1f8
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v1, 0x7fc00000, s0
+; GFX1032-NEXT: s_xor_b32 s0, s1, -1
; GFX1032-NEXT: s_brev_b32 s1, 1
-; GFX1032-NEXT: v_rcp_f32_e32 v2, v1
-; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX1032-NEXT: v_fma_f32 v5, -v1, v3, v4
-; GFX1032-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX1032-NEXT: v_fma_f32 v1, -v1, v3, v4
-; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3
-; GFX1032-NEXT: v_div_fixup_f32 v1, v1, s0, v0
-; GFX1032-NEXT: v_trunc_f32_e32 v1, v1
-; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0
+; GFX1032-NEXT: s_and_b32 vcc_lo, s0, s2
+; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0
@@ -2593,29 +2658,94 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1032-NEXT: s_and_saveexec_b32 s1, s0
-; GFX1032-NEXT: ; %bb.1: ; %if.then
+; GFX1032-NEXT: ; %bb.9: ; %if.then
; GFX1032-NEXT: ; divergent unreachable
-; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX1032-NEXT: ; %bb.10: ; %UnifiedReturnBlock
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: fcmp64:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x28
+; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x28
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0
-; GFX1064-NEXT: v_rcp_f32_e32 v2, v1
-; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0
-; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0
-; GFX1064-NEXT: v_trunc_f32_e32 v1, v1
-; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0
+; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, |s6|
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[2:3]
+; GFX1064-NEXT: ; %bb.1: ; %frem.else
+; GFX1064-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1064-NEXT: v_cmp_eq_f32_e64 vcc, v0, |s6|
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
+; GFX1064-NEXT: ; %bb.2: ; %Flow13
+; GFX1064-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB51_8
+; GFX1064-NEXT: ; %bb.3: ; %frem.compute
+; GFX1064-NEXT: v_frexp_mant_f32_e64 v1, |s6|
+; GFX1064-NEXT: v_frexp_exp_i32_f32_e32 v7, v0
+; GFX1064-NEXT: v_frexp_mant_f32_e32 v8, v0
+; GFX1064-NEXT: v_ldexp_f32 v1, v1, 1
+; GFX1064-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, 1.0
+; GFX1064-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0
+; GFX1064-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1064-NEXT: v_fma_f32 v4, -v2, v3, 1.0
+; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v3
+; GFX1064-NEXT: v_mul_f32_e32 v4, v5, v3
+; GFX1064-NEXT: v_fma_f32 v6, -v2, v4, v5
+; GFX1064-NEXT: v_fmac_f32_e32 v4, v6, v3
+; GFX1064-NEXT: v_frexp_exp_i32_f32_e32 v6, s6
+; GFX1064-NEXT: v_fma_f32 v5, -v2, v4, v5
+; GFX1064-NEXT: v_add_nc_u32_e32 v2, -1, v6
+; GFX1064-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX1064-NEXT: v_xad_u32 v4, v2, -1, v7
+; GFX1064-NEXT: v_ldexp_f32 v5, v8, 12
+; GFX1064-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB51_7
+; GFX1064-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1064-NEXT: v_sub_nc_u32_e32 v4, v7, v6
+; GFX1064-NEXT: s_mov_b64 s[4:5], 0
+; GFX1064-NEXT: v_add_nc_u32_e32 v4, 12, v4
+; GFX1064-NEXT: .LBB51_5: ; %frem.loop_body
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_mov_b32_e32 v6, v5
+; GFX1064-NEXT: v_add_nc_u32_e32 v4, -12, v4
+; GFX1064-NEXT: v_mul_f32_e32 v5, v6, v3
+; GFX1064-NEXT: v_rndne_f32_e32 v5, v5
+; GFX1064-NEXT: v_fma_f32 v5, -v5, v1, v6
+; GFX1064-NEXT: v_add_f32_e32 v7, v5, v1
+; GFX1064-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; GFX1064-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 13, v4
+; GFX1064-NEXT: v_ldexp_f32 v5, v5, 12
+; GFX1064-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_cbranch_execnz .LBB51_5
+; GFX1064-NEXT: ; %bb.6: ; %Flow
+; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v6
+; GFX1064-NEXT: .LBB51_7: ; %Flow12
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: v_add_nc_u32_e32 v4, -11, v4
+; GFX1064-NEXT: v_ldexp_f32 v4, v5, v4
+; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX1064-NEXT: v_rndne_f32_e32 v3, v3
+; GFX1064-NEXT: v_fma_f32 v3, -v3, v1, v4
+; GFX1064-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX1064-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX1064-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX1064-NEXT: v_and_b32_e32 v2, 0x80000000, v0
+; GFX1064-NEXT: v_xor_b32_e32 v1, v2, v1
+; GFX1064-NEXT: .LBB51_8: ; %Flow14
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 0x60
+; GFX1064-NEXT: v_cmp_class_f32_e64 s[0:1], s6, 3
+; GFX1064-NEXT: v_cmp_class_f32_e64 s[2:3], v0, 0x1f8
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v1, 0x7fc00000, s[4:5]
+; GFX1064-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; GFX1064-NEXT: s_and_b64 vcc, s[0:1], s[2:3]
+; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc
; GFX1064-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1
; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
@@ -2625,9 +2755,9 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX1064-NEXT: ; %bb.1: ; %if.then
+; GFX1064-NEXT: ; %bb.9: ; %if.then
; GFX1064-NEXT: ; divergent unreachable
-; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX1064-NEXT: ; %bb.10: ; %UnifiedReturnBlock
; GFX1064-NEXT: s_endpgm
entry:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -2756,20 +2886,85 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28
; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0
-; GFX1032-NEXT: v_rcp_f32_e32 v2, v1
-; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX1032-NEXT: v_div_scale_f32 v3, vcc_lo, v0, s0, v0
-; GFX1032-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX1032-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX1032-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX1032-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX1032-NEXT: v_div_fixup_f32 v1, v1, s0, v0
-; GFX1032-NEXT: v_trunc_f32_e32 v1, v1
-; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0
+; GFX1032-NEXT: v_cmp_ngt_f32_e64 s1, v0, |s0|
+; GFX1032-NEXT: s_and_saveexec_b32 s2, s1
+; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s2
+; GFX1032-NEXT: ; %bb.1: ; %frem.else
+; GFX1032-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1032-NEXT: v_cmp_eq_f32_e64 vcc_lo, v0, |s0|
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX1032-NEXT: ; %bb.2: ; %Flow13
+; GFX1032-NEXT: s_andn2_saveexec_b32 s1, s1
+; GFX1032-NEXT: s_cbranch_execz .LBB53_8
+; GFX1032-NEXT: ; %bb.3: ; %frem.compute
+; GFX1032-NEXT: v_frexp_mant_f32_e64 v1, |s0|
+; GFX1032-NEXT: v_frexp_exp_i32_f32_e32 v7, v0
+; GFX1032-NEXT: v_frexp_mant_f32_e32 v8, v0
+; GFX1032-NEXT: v_ldexp_f32 v1, v1, 1
+; GFX1032-NEXT: v_div_scale_f32 v2, s2, v1, v1, 1.0
+; GFX1032-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0
+; GFX1032-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1032-NEXT: v_fma_f32 v4, -v2, v3, 1.0
+; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v3
+; GFX1032-NEXT: v_mul_f32_e32 v4, v5, v3
+; GFX1032-NEXT: v_fma_f32 v6, -v2, v4, v5
+; GFX1032-NEXT: v_fmac_f32_e32 v4, v6, v3
+; GFX1032-NEXT: v_frexp_exp_i32_f32_e32 v6, s0
+; GFX1032-NEXT: v_fma_f32 v5, -v2, v4, v5
+; GFX1032-NEXT: v_add_nc_u32_e32 v2, -1, v6
+; GFX1032-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX1032-NEXT: v_xad_u32 v4, v2, -1, v7
+; GFX1032-NEXT: v_ldexp_f32 v5, v8, 12
+; GFX1032-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 12, v4
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB53_7
+; GFX1032-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1032-NEXT: v_sub_nc_u32_e32 v4, v7, v6
+; GFX1032-NEXT: s_mov_b32 s3, 0
+; GFX1032-NEXT: v_add_nc_u32_e32 v4, 12, v4
+; GFX1032-NEXT: .LBB53_5: ; %frem.loop_body
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_mov_b32_e32 v6, v5
+; GFX1032-NEXT: v_add_nc_u32_e32 v4, -12, v4
+; GFX1032-NEXT: v_mul_f32_e32 v5, v6, v3
+; GFX1032-NEXT: v_rndne_f32_e32 v5, v5
+; GFX1032-NEXT: v_fma_f32 v5, -v5, v1, v6
+; GFX1032-NEXT: v_add_f32_e32 v7, v5, v1
+; GFX1032-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5
+; GFX1032-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
+; GFX1032-NEXT: v_ldexp_f32 v5, v5, 12
+; GFX1032-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: s_cbranch_execnz .LBB53_5
+; GFX1032-NEXT: ; %bb.6: ; %Flow
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX1032-NEXT: v_mov_b32_e32 v5, v6
+; GFX1032-NEXT: .LBB53_7: ; %Flow12
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: v_add_nc_u32_e32 v4, -11, v4
+; GFX1032-NEXT: v_ldexp_f32 v4, v5, v4
+; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX1032-NEXT: v_rndne_f32_e32 v3, v3
+; GFX1032-NEXT: v_fma_f32 v3, -v3, v1, v4
+; GFX1032-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX1032-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX1032-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX1032-NEXT: v_and_b32_e32 v2, 0x80000000, v0
+; GFX1032-NEXT: v_xor_b32_e32 v1, v2, v1
+; GFX1032-NEXT: .LBB53_8: ; %Flow14
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: v_cmp_class_f32_e64 s1, s0, 3
+; GFX1032-NEXT: v_cmp_class_f32_e64 s0, s0, 0x60
+; GFX1032-NEXT: v_cmp_class_f32_e64 s2, v0, 0x1f8
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v1, 0x7fc00000, s0
+; GFX1032-NEXT: s_xor_b32 s0, s1, -1
+; GFX1032-NEXT: s_and_b32 vcc_lo, s0, s2
+; GFX1032-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0
@@ -2779,29 +2974,94 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1032-NEXT: s_and_saveexec_b32 s1, s0
-; GFX1032-NEXT: ; %bb.1: ; %if.then
+; GFX1032-NEXT: ; %bb.9: ; %if.then
; GFX1032-NEXT: ; divergent unreachable
-; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX1032-NEXT: ; %bb.10: ; %UnifiedReturnBlock
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: fcmp32:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x28
+; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x28
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0
-; GFX1064-NEXT: v_rcp_f32_e32 v2, v1
-; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0
-; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0
-; GFX1064-NEXT: v_trunc_f32_e32 v1, v1
-; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0
+; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[0:1], v0, |s6|
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[2:3]
+; GFX1064-NEXT: ; %bb.1: ; %frem.else
+; GFX1064-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1064-NEXT: v_cmp_eq_f32_e64 vcc, v0, |s6|
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
+; GFX1064-NEXT: ; %bb.2: ; %Flow13
+; GFX1064-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX1064-NEXT: s_cbranch_execz .LBB53_8
+; GFX1064-NEXT: ; %bb.3: ; %frem.compute
+; GFX1064-NEXT: v_frexp_mant_f32_e64 v1, |s6|
+; GFX1064-NEXT: v_frexp_exp_i32_f32_e32 v7, v0
+; GFX1064-NEXT: v_frexp_mant_f32_e32 v8, v0
+; GFX1064-NEXT: v_ldexp_f32 v1, v1, 1
+; GFX1064-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, 1.0
+; GFX1064-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0
+; GFX1064-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1064-NEXT: v_fma_f32 v4, -v2, v3, 1.0
+; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v3
+; GFX1064-NEXT: v_mul_f32_e32 v4, v5, v3
+; GFX1064-NEXT: v_fma_f32 v6, -v2, v4, v5
+; GFX1064-NEXT: v_fmac_f32_e32 v4, v6, v3
+; GFX1064-NEXT: v_frexp_exp_i32_f32_e32 v6, s6
+; GFX1064-NEXT: v_fma_f32 v5, -v2, v4, v5
+; GFX1064-NEXT: v_add_nc_u32_e32 v2, -1, v6
+; GFX1064-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX1064-NEXT: v_xad_u32 v4, v2, -1, v7
+; GFX1064-NEXT: v_ldexp_f32 v5, v8, 12
+; GFX1064-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
+; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 12, v4
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB53_7
+; GFX1064-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1064-NEXT: v_sub_nc_u32_e32 v4, v7, v6
+; GFX1064-NEXT: s_mov_b64 s[4:5], 0
+; GFX1064-NEXT: v_add_nc_u32_e32 v4, 12, v4
+; GFX1064-NEXT: .LBB53_5: ; %frem.loop_body
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_mov_b32_e32 v6, v5
+; GFX1064-NEXT: v_add_nc_u32_e32 v4, -12, v4
+; GFX1064-NEXT: v_mul_f32_e32 v5, v6, v3
+; GFX1064-NEXT: v_rndne_f32_e32 v5, v5
+; GFX1064-NEXT: v_fma_f32 v5, -v5, v1, v6
+; GFX1064-NEXT: v_add_f32_e32 v7, v5, v1
+; GFX1064-NEXT: v_cmp_gt_f32_e32 vcc, 0, v5
+; GFX1064-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 13, v4
+; GFX1064-NEXT: v_ldexp_f32 v5, v5, 12
+; GFX1064-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_cbranch_execnz .LBB53_5
+; GFX1064-NEXT: ; %bb.6: ; %Flow
+; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v6
+; GFX1064-NEXT: .LBB53_7: ; %Flow12
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: v_add_nc_u32_e32 v4, -11, v4
+; GFX1064-NEXT: v_ldexp_f32 v4, v5, v4
+; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX1064-NEXT: v_rndne_f32_e32 v3, v3
+; GFX1064-NEXT: v_fma_f32 v3, -v3, v1, v4
+; GFX1064-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX1064-NEXT: v_cmp_gt_f32_e32 vcc, 0, v3
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX1064-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX1064-NEXT: v_and_b32_e32 v2, 0x80000000, v0
+; GFX1064-NEXT: v_xor_b32_e32 v1, v2, v1
+; GFX1064-NEXT: .LBB53_8: ; %Flow14
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 0x60
+; GFX1064-NEXT: v_cmp_class_f32_e64 s[0:1], s6, 3
+; GFX1064-NEXT: v_cmp_class_f32_e64 s[2:3], v0, 0x1f8
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v1, 0x7fc00000, s[4:5]
+; GFX1064-NEXT: s_xor_b64 s[0:1], s[0:1], -1
+; GFX1064-NEXT: s_and_b64 vcc, s[0:1], s[2:3]
+; GFX1064-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc
; GFX1064-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
; GFX1064-NEXT: s_lshr_b32 s0, vcc_lo, 1
; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
@@ -2811,9 +3071,9 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX1064-NEXT: ; %bb.1: ; %if.then
+; GFX1064-NEXT: ; %bb.9: ; %if.then
; GFX1064-NEXT: ; divergent unreachable
-; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX1064-NEXT: ; %bb.10: ; %UnifiedReturnBlock
; GFX1064-NEXT: s_endpgm
entry:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
>From 193957c97f75009fe5801f0291f67ea592785c64 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 12 Mar 2025 12:35:42 -0400
Subject: [PATCH 2/4] Adjust some comments, remove include
---
llvm/lib/CodeGen/ExpandLargeFpConvert.cpp | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index 31d3779eb7c9f..0bef9abc0eac3 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -22,7 +22,6 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/IR/FMF.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/PassManager.h"
@@ -138,9 +137,8 @@ class FRemExpander {
const Twine &PowName) const {
// Build:
// ExName = BUILTIN_FREXP_EXP_ComputeFpTy(Src) - 1;
- // PowName =
- // BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ExName),
- // NewExp);
+ // PowName = BUILTIN_FLDEXP_ComputeFpTy(
+ // BUILTIN_FREXP_MANT_ComputeFpTy(ExName), NewExp);
Type *Ty = Src->getType();
Type *ExTy = B.getInt32Ty();
Value *Frexp = B.CreateIntrinsic(Intrinsic::frexp, {Ty, ExTy}, Src);
@@ -161,10 +159,11 @@ class FRemExpander {
PHINode *RetPhi) const {
// Build:
// ex = BUILTIN_FREXP_EXP_ComputeFpTy(ax) - 1;
- // ax = BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ax),
- // bits); ey = BUILTIN_FREXP_EXP_ComputeFpTy(ay) - 1; ay =
- // BUILTIN_FLDEXP_ComputeFpTy(BUILTIN_FREXP_MANT_ComputeFpTy(ay), 1); auto
- // [Ax, Ex]{getFrexpResults(B, AxInitial)};
+ // ax = BUILTIN_FLDEXP_ComputeFpTy(
+ // BUILTIN_FREXP_MANT_ComputeFpTy(ax), bits);
+ // ey = BUILTIN_FREXP_EXP_ComputeFpTy(ay) - 1;
+ // ay = BUILTIN_FLDEXP_ComputeFpTy(
+ // BUILTIN_FREXP_MANT_ComputeFpTy(ay), 1);
auto [Ax, Ex] = buildExpAndPower(AxInitial, Bits, "ex", "ax");
auto [Ay, Ey] = buildExpAndPower(AyInitial, One, "ey", "ay");
@@ -218,7 +217,7 @@ class FRemExpander {
AxPhiExit, B.CreateAdd(B.CreateSub(NbExitPhi, Bits), One), "ax");
AxFinal = buildUpdateAx(AxFinal, Ay, Ayinv);
- // Adjust exponent and sign
+ // Build:
// ax = BUILTIN_FLDEXP_ComputeFpTy(ax, ey);
// ret = AS_FLOAT((AS_INT(x) & SIGNBIT_SP32) ^ AS_INT(ax));
AxFinal = createLdexp(AxFinal, Ey, "ax");
@@ -257,7 +256,8 @@ class FRemExpander {
// Build:
// ret = y == 0.0f ? QNAN_ComputeFpTy : ret;
// bool c = !BUILTIN_ISNAN_ComputeFpTy(y) &&
- // BUILTIN_ISFINITE_ComputeFpTy(x); ret = c ? ret : QNAN_ComputeFpTy;
+ // BUILTIN_ISFINITE_ComputeFpTy(x);
+ // ret = c ? ret : QNAN_ComputeFpTy;
// TODO Handle NaN and infinity fast math flags separately here?
Value *Nan = ConstantFP::getQNaN(FremTy);
>From 58a2edb4bae94e19b476802a56a02c051abd2a62 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Wed, 12 Mar 2025 12:38:28 -0400
Subject: [PATCH 3/4] clang-format changes
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index b2b136c984bf4..ab955e7eee2a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -388,7 +388,6 @@ class AMDGPUTargetLowering : public TargetLowering {
return MVT::i32;
}
bool shouldExpandFRemInIR() const override { return true; };
-
};
namespace AMDGPUISD {
>From f3f0a8f41da3ad9d3ad3ff1e1b5abde50a69967c Mon Sep 17 00:00:00 2001
From: Frederik Harwath <frederik at harwath.name>
Date: Thu, 13 Mar 2025 10:53:52 +0100
Subject: [PATCH 4/4] Apply suggestions from code review
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/lib/CodeGen/ExpandLargeFpConvert.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index 0bef9abc0eac3..fa9bfab03bdf7 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -387,7 +387,7 @@ static bool expandFRem(BinaryOperator &I) {
if (ReturnTy->isFloatingPointTy())
Ret = Expander->buildFRem(I.getOperand(0), I.getOperand(1));
else {
- auto VecTy = cast<FixedVectorType>(ReturnTy);
+ auto *VecTy = cast<FixedVectorType>(ReturnTy);
// This could use SplitBlockAndInsertForEachLane but the interface
// is a bit awkward for a constant number of elements and it will
@@ -406,8 +406,7 @@ static bool expandFRem(BinaryOperator &I) {
I.replaceAllUsesWith(Ret);
Ret->takeName(&I);
- I.removeFromParent();
- I.dropAllReferences();
+ I.eraseFromParent();
return true;
}
@@ -1036,7 +1035,7 @@ static bool runImpl(Function &F, const TargetLowering &TLI) {
while (!Replace.empty()) {
Instruction *I = Replace.pop_back_val();
if (I->getOpcode() == Instruction::FRem)
- expandFRem(llvm::cast<BinaryOperator>(*I));
+ expandFRem(cast<BinaryOperator>(*I));
else if (I->getOpcode() == Instruction::FPToUI ||
I->getOpcode() == Instruction::FPToSI) {
expandFPToI(I);
More information about the llvm-commits
mailing list