[llvm] d21b315 - [RISCV] Remove vmerges from vector ceil, floor, trunc lowering.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 30 11:01:44 PDT 2022
Author: Craig Topper
Date: 2022-07-30T10:58:41-07:00
New Revision: d21b315360608c7fc776a8cf09dc25c10da0240f
URL: https://github.com/llvm/llvm-project/commit/d21b315360608c7fc776a8cf09dc25c10da0240f
DIFF: https://github.com/llvm/llvm-project/commit/d21b315360608c7fc776a8cf09dc25c10da0240f.diff
LOG: [RISCV] Remove vmerges from vector ceil, floor, trunc lowering.
Use masked operations so that the conversions cannot set spurious
exception bits in fflags for lanes that don't need converting
(NaNs and values that are already integral). Unfortunately, doing
this adds extra vector register copies.
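
For readers unfamiliar with the trick, the per-element algorithm the new
lowering implements can be sketched in scalar C++ (a minimal model; the
names ceil_lane, MaxVal, and Active are illustrative and not part of the
patch):

    #include <cmath>
    #include <cstdint>

    // Scalar model of the masked vector ceil lowering. Lanes whose
    // magnitude is at least 2^(precision-1), and NaN lanes, are masked
    // off, so the fp-to-int conversion never sees them and cannot set
    // spurious exception bits in fflags.
    float ceil_lane(float x) {
      const float MaxVal = 0x1p23f;        // 2^(precision-1) for float
      bool Active = std::fabs(x) < MaxVal; // false for NaN and big values
      if (!Active)
        return x;                          // masked-off lane: unchanged
      float T = (float)(int32_t)x;         // truncate toward zero
      if (T < x)                           // truncation went the wrong way
        T += 1.0f;                         // adjust up for ceil
      return std::copysign(T, x);          // restore sign, keeping -0.0
    }

The vector code below does the same thing with Active as the mask: the
masked conversions and the masked vfadd/vfsub only execute on active
lanes, and the final masked vfsgnj merges the result into the source
register, which is why FCOPYSIGN_VL grows a merge operand in this patch.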
Added:
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.h
llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f9119884613f4..59344a1396f15 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1815,59 +1815,94 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
// correct.
// TODO: Floor and ceil could be shorter by changing rounding mode, but we don't
// have FRM dependencies modeled yet.
-static SDValue lowerFTRUNC_FCEIL_FFLOOR(SDValue Op, SelectionDAG &DAG) {
+static SDValue lowerFTRUNC_FCEIL_FFLOOR(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
assert(VT.isVector() && "Unexpected type");
SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+ Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
+ }
+
+ SDValue TrueMask, VL;
+ std::tie(TrueMask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
// Freeze the source since we are increasing the number of uses.
- SDValue Src = DAG.getFreeze(Op.getOperand(0));
+ Src = DAG.getFreeze(Src);
- // Truncate to integer and convert back to FP.
- MVT IntVT = VT.changeVectorElementTypeToInteger();
- SDValue Truncated = DAG.getNode(ISD::FP_TO_SINT, DL, IntVT, Src);
- Truncated = DAG.getNode(ISD::SINT_TO_FP, DL, VT, Truncated);
+ // We do the conversion on the absolute value and fix the sign at the end.
+ SDValue Abs =
+ DAG.getNode(RISCVISD::FABS_VL, DL, ContainerVT, Src, TrueMask, VL);
- MVT SetccVT = MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
+ // Determine the smallest magnitude at which every representable value is
+ // an integer. This value and anything larger have no fractional bits, so
+ // they don't need to be converted.
+ const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(ContainerVT);
+ unsigned Precision = APFloat::semanticsPrecision(FltSem);
+ APFloat MaxVal = APFloat(FltSem);
+ MaxVal.convertFromAPInt(APInt::getOneBitSet(Precision, Precision - 1),
+ /*IsSigned*/ false, APFloat::rmNearestTiesToEven);
+ SDValue MaxValNode =
+ DAG.getConstantFP(MaxVal, DL, ContainerVT.getVectorElementType());
+ SDValue MaxValSplat = DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), MaxValNode, VL);
+
+ // If abs(Src) was at least MaxVal or is NaN, keep it.
+ MVT SetccVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+ SDValue Mask = DAG.getNode(RISCVISD::SETCC_VL, DL, SetccVT, Abs, MaxValSplat,
+ DAG.getCondCode(ISD::SETOLT), TrueMask, VL);
+
+ // Truncate to integer and convert back to FP.
+ MVT IntVT = ContainerVT.changeVectorElementTypeToInteger();
+ SDValue Truncated =
+ DAG.getNode(RISCVISD::FP_TO_SINT_VL, DL, IntVT, Src, Mask, VL);
+ Truncated = DAG.getNode(RISCVISD::SINT_TO_FP_VL, DL, ContainerVT, Truncated,
+ Mask, VL);
if (Op.getOpcode() == ISD::FCEIL) {
// If the truncated value is greater than or equal to the original
// value, we've computed the ceil. Otherwise, we went the wrong way and
// need to increase by 1.
// FIXME: This should use a masked operation. Handle here or in isel?
- SDValue Adjust = DAG.getNode(ISD::FADD, DL, VT, Truncated,
- DAG.getConstantFP(1.0, DL, VT));
- SDValue NeedAdjust = DAG.getSetCC(DL, SetccVT, Truncated, Src, ISD::SETOLT);
- Truncated = DAG.getSelect(DL, VT, NeedAdjust, Adjust, Truncated);
+ SDValue SplatVal =
+ DAG.getConstantFP(1.0, DL, ContainerVT.getVectorElementType());
+ SDValue Splat = DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), SplatVal, VL);
+ SDValue NeedAdjust =
+ DAG.getNode(RISCVISD::SETCC_VL, DL, SetccVT, Truncated, Src,
+ DAG.getCondCode(ISD::SETOLT), Mask, VL);
+ Truncated = DAG.getNode(RISCVISD::FADD_VL, DL, ContainerVT, Truncated,
+ Splat, Truncated, NeedAdjust, VL);
} else if (Op.getOpcode() == ISD::FFLOOR) {
// If the truncated value is less than or equal to the original value,
// we've computed the floor. Otherwise, we went the wrong way and need to
// decrease by 1.
// FIXME: This should use a masked operation. Handle here or in isel?
- SDValue Adjust = DAG.getNode(ISD::FSUB, DL, VT, Truncated,
- DAG.getConstantFP(1.0, DL, VT));
- SDValue NeedAdjust = DAG.getSetCC(DL, SetccVT, Truncated, Src, ISD::SETOGT);
- Truncated = DAG.getSelect(DL, VT, NeedAdjust, Adjust, Truncated);
+ SDValue SplatVal =
+ DAG.getConstantFP(1.0, DL, ContainerVT.getVectorElementType());
+ SDValue Splat = DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), SplatVal, VL);
+ SDValue NeedAdjust =
+ DAG.getNode(RISCVISD::SETCC_VL, DL, SetccVT, Src, Truncated,
+ DAG.getCondCode(ISD::SETOLT), Mask, VL);
+ Truncated = DAG.getNode(RISCVISD::FSUB_VL, DL, ContainerVT, Truncated,
+ Splat, Truncated, NeedAdjust, VL);
}
// Restore the original sign so that -0.0 is preserved.
- Truncated = DAG.getNode(ISD::FCOPYSIGN, DL, VT, Truncated, Src);
+ Truncated = DAG.getNode(RISCVISD::FCOPYSIGN_VL, DL, ContainerVT, Truncated,
+ Src, Src, Mask, VL);
- // Determine the largest integer that can be represented exactly. This and
- // values larger than it don't have any fractional bits so don't need to
- // be converted.
- const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
- unsigned Precision = APFloat::semanticsPrecision(FltSem);
- APFloat MaxVal = APFloat(FltSem);
- MaxVal.convertFromAPInt(APInt::getOneBitSet(Precision, Precision - 1),
- /*IsSigned*/ false, APFloat::rmNearestTiesToEven);
- SDValue MaxValNode = DAG.getConstantFP(MaxVal, DL, VT);
+ if (!VT.isFixedLengthVector())
+ return Truncated;
- // If abs(Src) was larger than MaxVal or nan, keep it.
- SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, Src);
- SDValue Setcc = DAG.getSetCC(DL, SetccVT, Abs, MaxValNode, ISD::SETOLT);
- return DAG.getSelect(DL, VT, Setcc, Truncated, Src);
+ return convertFromScalableVector(VT, Truncated, DAG, Subtarget);
}
// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
@@ -3443,7 +3478,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::FTRUNC:
case ISD::FCEIL:
case ISD::FFLOOR:
- return lowerFTRUNC_FCEIL_FFLOOR(Op, DAG);
+ return lowerFTRUNC_FCEIL_FFLOOR(Op, DAG, Subtarget);
case ISD::FROUND:
return lowerFROUND(Op, DAG, Subtarget);
case ISD::VECREDUCE_ADD:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 19d8b3647452c..a1ab23acbb220 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -251,7 +251,7 @@ enum NodeType : unsigned {
FNEG_VL,
FABS_VL,
FSQRT_VL,
- FCOPYSIGN_VL,
+ FCOPYSIGN_VL, // Has a merge operand
FP_TO_SINT_VL,
FP_TO_UINT_VL,
SINT_TO_FP_VL,
diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
index 86252ff093a3f..2d4824dc2be48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
@@ -7,20 +7,22 @@
define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) {
; CHECK-LABEL: ceil_nxv1f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI0_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI0_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI0_1)(a0)
-; CHECK-NEXT: vfadd.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flh ft0, %lo(.LCPI0_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; CHECK-NEXT: vfadd.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 1 x half> @llvm.ceil.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
@@ -30,20 +32,22 @@ declare <vscale x 1 x half> @llvm.ceil.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) {
; CHECK-LABEL: ceil_nxv2f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI1_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI1_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI1_1)(a0)
-; CHECK-NEXT: vfadd.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flh ft0, %lo(.LCPI1_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; CHECK-NEXT: vfadd.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 2 x half> @llvm.ceil.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
@@ -53,20 +57,22 @@ declare <vscale x 2 x half> @llvm.ceil.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) {
; CHECK-LABEL: ceil_nxv4f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI2_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI2_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI2_1)(a0)
-; CHECK-NEXT: vfadd.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flh ft0, %lo(.LCPI2_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfadd.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 4 x half> @llvm.ceil.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
@@ -76,20 +82,23 @@ declare <vscale x 4 x half> @llvm.ceil.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) {
; CHECK-LABEL: ceil_nxv8f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT: vfcvt.f.x.v v10, v10
; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI3_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v10, v8
-; CHECK-NEXT: lui a0, %hi(.LCPI3_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI3_1)(a0)
-; CHECK-NEXT: vfadd.vf v12, v10, ft0
-; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu
; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, ft1
-; CHECK-NEXT: vfsgnj.vv v10, v10, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmflt.vf v10, v12, ft0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI3_1)
+; CHECK-NEXT: flh ft0, %lo(.LCPI3_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v11, v12, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vfadd.vf v12, v12, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 8 x half> @llvm.ceil.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
@@ -99,20 +108,23 @@ declare <vscale x 8 x half> @llvm.ceil.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) {
; CHECK-LABEL: ceil_nxv16f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT: vfcvt.f.x.v v12, v12
; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI4_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v12, v8
-; CHECK-NEXT: lui a0, %hi(.LCPI4_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI4_1)(a0)
-; CHECK-NEXT: vfadd.vf v16, v12, ft0
-; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, ft1
-; CHECK-NEXT: vfsgnj.vv v12, v12, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
+; CHECK-NEXT: vmflt.vf v12, v16, ft0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI4_1)
+; CHECK-NEXT: flh ft0, %lo(.LCPI4_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v13, v16, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vfadd.vf v16, v16, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 16 x half> @llvm.ceil.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
@@ -122,20 +134,23 @@ declare <vscale x 16 x half> @llvm.ceil.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) {
; CHECK-LABEL: ceil_nxv32f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT: vfcvt.f.x.v v16, v16
; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI5_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v16, v8
-; CHECK-NEXT: lui a0, %hi(.LCPI5_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI5_1)(a0)
-; CHECK-NEXT: vfadd.vf v24, v16, ft0
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu
; CHECK-NEXT: vfabs.v v24, v8
-; CHECK-NEXT: vmflt.vf v0, v24, ft1
-; CHECK-NEXT: vfsgnj.vv v16, v16, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vmflt.vf v16, v24, ft0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfcvt.rtz.x.f.v v24, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI5_1)
+; CHECK-NEXT: flh ft0, %lo(.LCPI5_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmflt.vv v17, v24, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vfadd.vf v24, v24, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 32 x half> @llvm.ceil.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
@@ -145,20 +160,22 @@ declare <vscale x 32 x half> @llvm.ceil.nxv32f16(<vscale x 32 x half>)
define <vscale x 1 x float> @ceil_nxv1f32(<vscale x 1 x float> %x) {
; CHECK-LABEL: ceil_nxv1f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI6_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI6_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI6_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI6_1)(a0)
-; CHECK-NEXT: vfadd.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flw ft0, %lo(.LCPI6_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfadd.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 1 x float> @llvm.ceil.nxv1f32(<vscale x 1 x float> %x)
ret <vscale x 1 x float> %a
@@ -168,20 +185,22 @@ declare <vscale x 1 x float> @llvm.ceil.nxv1f32(<vscale x 1 x float>)
define <vscale x 2 x float> @ceil_nxv2f32(<vscale x 2 x float> %x) {
; CHECK-LABEL: ceil_nxv2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI7_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI7_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI7_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI7_1)(a0)
-; CHECK-NEXT: vfadd.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flw ft0, %lo(.LCPI7_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfadd.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 2 x float> @llvm.ceil.nxv2f32(<vscale x 2 x float> %x)
ret <vscale x 2 x float> %a
@@ -191,20 +210,23 @@ declare <vscale x 2 x float> @llvm.ceil.nxv2f32(<vscale x 2 x float>)
define <vscale x 4 x float> @ceil_nxv4f32(<vscale x 4 x float> %x) {
; CHECK-LABEL: ceil_nxv4f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT: vfcvt.f.x.v v10, v10
; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI8_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v10, v8
-; CHECK-NEXT: lui a0, %hi(.LCPI8_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI8_1)(a0)
-; CHECK-NEXT: vfadd.vf v12, v10, ft0
-; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu
; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, ft1
-; CHECK-NEXT: vfsgnj.vv v10, v10, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmflt.vf v10, v12, ft0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI8_1)
+; CHECK-NEXT: flw ft0, %lo(.LCPI8_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v11, v12, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vfadd.vf v12, v12, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> %x)
ret <vscale x 4 x float> %a
@@ -214,20 +236,23 @@ declare <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float>)
define <vscale x 8 x float> @ceil_nxv8f32(<vscale x 8 x float> %x) {
; CHECK-LABEL: ceil_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT: vfcvt.f.x.v v12, v12
; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI9_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v12, v8
-; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI9_1)(a0)
-; CHECK-NEXT: vfadd.vf v16, v12, ft0
-; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu
; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, ft1
-; CHECK-NEXT: vfsgnj.vv v12, v12, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
+; CHECK-NEXT: vmflt.vf v12, v16, ft0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
+; CHECK-NEXT: flw ft0, %lo(.LCPI9_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v13, v16, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vfadd.vf v16, v16, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 8 x float> @llvm.ceil.nxv8f32(<vscale x 8 x float> %x)
ret <vscale x 8 x float> %a
@@ -237,20 +262,23 @@ declare <vscale x 8 x float> @llvm.ceil.nxv8f32(<vscale x 8 x float>)
define <vscale x 16 x float> @ceil_nxv16f32(<vscale x 16 x float> %x) {
; CHECK-LABEL: ceil_nxv16f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT: vfcvt.f.x.v v16, v16
; CHECK-NEXT: lui a0, %hi(.LCPI10_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI10_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v16, v8
-; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI10_1)(a0)
-; CHECK-NEXT: vfadd.vf v24, v16, ft0
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu
; CHECK-NEXT: vfabs.v v24, v8
-; CHECK-NEXT: vmflt.vf v0, v24, ft1
-; CHECK-NEXT: vfsgnj.vv v16, v16, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vmflt.vf v16, v24, ft0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfcvt.rtz.x.f.v v24, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
+; CHECK-NEXT: flw ft0, %lo(.LCPI10_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmflt.vv v17, v24, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vfadd.vf v24, v24, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 16 x float> @llvm.ceil.nxv16f32(<vscale x 16 x float> %x)
ret <vscale x 16 x float> %a
@@ -260,20 +288,22 @@ declare <vscale x 16 x float> @llvm.ceil.nxv16f32(<vscale x 16 x float>)
define <vscale x 1 x double> @ceil_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-LABEL: ceil_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI11_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI11_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI11_1)(a0)
-; CHECK-NEXT: vfadd.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: fld ft0, %lo(.LCPI11_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfadd.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 1 x double> @llvm.ceil.nxv1f64(<vscale x 1 x double> %x)
ret <vscale x 1 x double> %a
@@ -283,20 +313,23 @@ declare <vscale x 1 x double> @llvm.ceil.nxv1f64(<vscale x 1 x double>)
define <vscale x 2 x double> @ceil_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-LABEL: ceil_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT: vfcvt.f.x.v v10, v10
; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI12_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v10, v8
-; CHECK-NEXT: lui a0, %hi(.LCPI12_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI12_1)(a0)
-; CHECK-NEXT: vfadd.vf v12, v10, ft0
-; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu
; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, ft1
-; CHECK-NEXT: vfsgnj.vv v10, v10, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmflt.vf v10, v12, ft0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI12_1)
+; CHECK-NEXT: fld ft0, %lo(.LCPI12_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v11, v12, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vfadd.vf v12, v12, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> %x)
ret <vscale x 2 x double> %a
@@ -306,20 +339,23 @@ declare <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double>)
define <vscale x 4 x double> @ceil_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-LABEL: ceil_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT: vfcvt.f.x.v v12, v12
; CHECK-NEXT: lui a0, %hi(.LCPI13_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI13_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v12, v8
-; CHECK-NEXT: lui a0, %hi(.LCPI13_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI13_1)(a0)
-; CHECK-NEXT: vfadd.vf v16, v12, ft0
-; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0
+; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu
; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, ft1
-; CHECK-NEXT: vfsgnj.vv v12, v12, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
+; CHECK-NEXT: vmflt.vf v12, v16, ft0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI13_1)
+; CHECK-NEXT: fld ft0, %lo(.LCPI13_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v13, v16, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vfadd.vf v16, v16, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 4 x double> @llvm.ceil.nxv4f64(<vscale x 4 x double> %x)
ret <vscale x 4 x double> %a
@@ -329,20 +365,23 @@ declare <vscale x 4 x double> @llvm.ceil.nxv4f64(<vscale x 4 x double>)
define <vscale x 8 x double> @ceil_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-LABEL: ceil_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT: vfcvt.f.x.v v16, v16
; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI14_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v16, v8
-; CHECK-NEXT: lui a0, %hi(.LCPI14_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI14_1)(a0)
-; CHECK-NEXT: vfadd.vf v24, v16, ft0
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu
; CHECK-NEXT: vfabs.v v24, v8
-; CHECK-NEXT: vmflt.vf v0, v24, ft1
-; CHECK-NEXT: vfsgnj.vv v16, v16, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vmflt.vf v16, v24, ft0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfcvt.rtz.x.f.v v24, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI14_1)
+; CHECK-NEXT: fld ft0, %lo(.LCPI14_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vmflt.vv v17, v24, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vfadd.vf v24, v24, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 8 x double> @llvm.ceil.nxv8f64(<vscale x 8 x double> %x)
ret <vscale x 8 x double> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
index a261887305016..999c3f25d9e40 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
@@ -7,20 +7,22 @@
define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) {
; CHECK-LABEL: floor_nxv1f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI0_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI0_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI0_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI0_1)(a0)
-; CHECK-NEXT: vfsub.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flh ft0, %lo(.LCPI0_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu
+; CHECK-NEXT: vfsub.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 1 x half> @llvm.floor.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
@@ -30,20 +32,22 @@ declare <vscale x 1 x half> @llvm.floor.nxv1f16(<vscale x 1 x half>)
define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) {
; CHECK-LABEL: floor_nxv2f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI1_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI1_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI1_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI1_1)(a0)
-; CHECK-NEXT: vfsub.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flh ft0, %lo(.LCPI1_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu
+; CHECK-NEXT: vfsub.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 2 x half> @llvm.floor.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
@@ -53,20 +57,22 @@ declare <vscale x 2 x half> @llvm.floor.nxv2f16(<vscale x 2 x half>)
define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) {
; CHECK-LABEL: floor_nxv4f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI2_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI2_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI2_1)(a0)
-; CHECK-NEXT: vfsub.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flh ft0, %lo(.LCPI2_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfsub.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 4 x half> @llvm.floor.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
@@ -76,20 +82,23 @@ declare <vscale x 4 x half> @llvm.floor.nxv4f16(<vscale x 4 x half>)
define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) {
; CHECK-LABEL: floor_nxv8f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT: vfcvt.f.x.v v10, v10
; CHECK-NEXT: lui a0, %hi(.LCPI3_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI3_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v10
-; CHECK-NEXT: lui a0, %hi(.LCPI3_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI3_1)(a0)
-; CHECK-NEXT: vfsub.vf v12, v10, ft0
-; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu
; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, ft1
-; CHECK-NEXT: vfsgnj.vv v10, v10, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmflt.vf v10, v12, ft0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI3_1)
+; CHECK-NEXT: flh ft0, %lo(.LCPI3_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v11, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vfsub.vf v12, v12, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 8 x half> @llvm.floor.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
@@ -99,20 +108,23 @@ declare <vscale x 8 x half> @llvm.floor.nxv8f16(<vscale x 8 x half>)
define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) {
; CHECK-LABEL: floor_nxv16f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT: vfcvt.f.x.v v12, v12
; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI4_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v12
-; CHECK-NEXT: lui a0, %hi(.LCPI4_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI4_1)(a0)
-; CHECK-NEXT: vfsub.vf v16, v12, ft0
-; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0
+; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, ft1
-; CHECK-NEXT: vfsgnj.vv v12, v12, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
+; CHECK-NEXT: vmflt.vf v12, v16, ft0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI4_1)
+; CHECK-NEXT: flh ft0, %lo(.LCPI4_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v13, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vfsub.vf v16, v16, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 16 x half> @llvm.floor.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
@@ -122,20 +134,23 @@ declare <vscale x 16 x half> @llvm.floor.nxv16f16(<vscale x 16 x half>)
define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) {
; CHECK-LABEL: floor_nxv32f16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT: vfcvt.f.x.v v16, v16
; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI5_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v16
-; CHECK-NEXT: lui a0, %hi(.LCPI5_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI5_1)(a0)
-; CHECK-NEXT: vfsub.vf v24, v16, ft0
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu
; CHECK-NEXT: vfabs.v v24, v8
-; CHECK-NEXT: vmflt.vf v0, v24, ft1
-; CHECK-NEXT: vfsgnj.vv v16, v16, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vmflt.vf v16, v24, ft0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfcvt.rtz.x.f.v v24, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI5_1)
+; CHECK-NEXT: flh ft0, %lo(.LCPI5_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmflt.vv v17, v8, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vfsub.vf v24, v24, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 32 x half> @llvm.floor.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
@@ -145,20 +160,22 @@ declare <vscale x 32 x half> @llvm.floor.nxv32f16(<vscale x 32 x half>)
define <vscale x 1 x float> @floor_nxv1f32(<vscale x 1 x float> %x) {
; CHECK-LABEL: floor_nxv1f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI6_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI6_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI6_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI6_1)(a0)
-; CHECK-NEXT: vfsub.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flw ft0, %lo(.LCPI6_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-NEXT: vfsub.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 1 x float> @llvm.floor.nxv1f32(<vscale x 1 x float> %x)
ret <vscale x 1 x float> %a
@@ -168,20 +185,22 @@ declare <vscale x 1 x float> @llvm.floor.nxv1f32(<vscale x 1 x float>)
define <vscale x 2 x float> @floor_nxv2f32(<vscale x 2 x float> %x) {
; CHECK-LABEL: floor_nxv2f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI7_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI7_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI7_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI7_1)(a0)
-; CHECK-NEXT: vfsub.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: flw ft0, %lo(.LCPI7_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsub.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 2 x float> @llvm.floor.nxv2f32(<vscale x 2 x float> %x)
ret <vscale x 2 x float> %a
@@ -191,20 +210,23 @@ declare <vscale x 2 x float> @llvm.floor.nxv2f32(<vscale x 2 x float>)
define <vscale x 4 x float> @floor_nxv4f32(<vscale x 4 x float> %x) {
; CHECK-LABEL: floor_nxv4f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT: vfcvt.f.x.v v10, v10
; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI8_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v10
-; CHECK-NEXT: lui a0, %hi(.LCPI8_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI8_1)(a0)
-; CHECK-NEXT: vfsub.vf v12, v10, ft0
-; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu
; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, ft1
-; CHECK-NEXT: vfsgnj.vv v10, v10, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmflt.vf v10, v12, ft0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI8_1)
+; CHECK-NEXT: flw ft0, %lo(.LCPI8_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v11, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vfsub.vf v12, v12, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> %x)
ret <vscale x 4 x float> %a
@@ -214,20 +236,23 @@ declare <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float>)
define <vscale x 8 x float> @floor_nxv8f32(<vscale x 8 x float> %x) {
; CHECK-LABEL: floor_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT: vfcvt.f.x.v v12, v12
; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI9_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v12
-; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI9_1)(a0)
-; CHECK-NEXT: vfsub.vf v16, v12, ft0
-; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu
; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, ft1
-; CHECK-NEXT: vfsgnj.vv v12, v12, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
+; CHECK-NEXT: vmflt.vf v12, v16, ft0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
+; CHECK-NEXT: flw ft0, %lo(.LCPI9_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v13, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vfsub.vf v16, v16, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 8 x float> @llvm.floor.nxv8f32(<vscale x 8 x float> %x)
ret <vscale x 8 x float> %a
@@ -237,20 +262,23 @@ declare <vscale x 8 x float> @llvm.floor.nxv8f32(<vscale x 8 x float>)
define <vscale x 16 x float> @floor_nxv16f32(<vscale x 16 x float> %x) {
; CHECK-LABEL: floor_nxv16f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT: vfcvt.f.x.v v16, v16
; CHECK-NEXT: lui a0, %hi(.LCPI10_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI10_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v16
-; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI10_1)(a0)
-; CHECK-NEXT: vfsub.vf v24, v16, ft0
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu
; CHECK-NEXT: vfabs.v v24, v8
-; CHECK-NEXT: vmflt.vf v0, v24, ft1
-; CHECK-NEXT: vfsgnj.vv v16, v16, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vmflt.vf v16, v24, ft0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfcvt.rtz.x.f.v v24, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
+; CHECK-NEXT: flw ft0, %lo(.LCPI10_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmflt.vv v17, v8, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vfsub.vf v24, v24, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 16 x float> @llvm.floor.nxv16f32(<vscale x 16 x float> %x)
ret <vscale x 16 x float> %a
@@ -260,20 +288,22 @@ declare <vscale x 16 x float> @llvm.floor.nxv16f32(<vscale x 16 x float>)
define <vscale x 1 x double> @floor_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-LABEL: floor_nxv1f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
; CHECK-NEXT: lui a0, %hi(.LCPI11_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI11_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfabs.v v9, v8
+; CHECK-NEXT: vmflt.vf v9, v9, ft0
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a0, %hi(.LCPI11_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI11_1)(a0)
-; CHECK-NEXT: vfsub.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: fld ft0, %lo(.LCPI11_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v8, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfsub.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 1 x double> @llvm.floor.nxv1f64(<vscale x 1 x double> %x)
ret <vscale x 1 x double> %a
@@ -283,20 +313,23 @@ declare <vscale x 1 x double> @llvm.floor.nxv1f64(<vscale x 1 x double>)
define <vscale x 2 x double> @floor_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-LABEL: floor_nxv2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT: vfcvt.f.x.v v10, v10
; CHECK-NEXT: lui a0, %hi(.LCPI12_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI12_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v10
-; CHECK-NEXT: lui a0, %hi(.LCPI12_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI12_1)(a0)
-; CHECK-NEXT: vfsub.vf v12, v10, ft0
-; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu
; CHECK-NEXT: vfabs.v v12, v8
-; CHECK-NEXT: vmflt.vf v0, v12, ft1
-; CHECK-NEXT: vfsgnj.vv v10, v10, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vmflt.vf v10, v12, ft0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI12_1)
+; CHECK-NEXT: fld ft0, %lo(.LCPI12_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vmflt.vv v11, v8, v12, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vfsub.vf v12, v12, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> %x)
ret <vscale x 2 x double> %a
@@ -306,20 +339,23 @@ declare <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double>)
define <vscale x 4 x double> @floor_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-LABEL: floor_nxv4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT: vfcvt.f.x.v v12, v12
; CHECK-NEXT: lui a0, %hi(.LCPI13_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI13_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v12
-; CHECK-NEXT: lui a0, %hi(.LCPI13_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI13_1)(a0)
-; CHECK-NEXT: vfsub.vf v16, v12, ft0
-; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0
+; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu
; CHECK-NEXT: vfabs.v v16, v8
-; CHECK-NEXT: vmflt.vf v0, v16, ft1
-; CHECK-NEXT: vfsgnj.vv v12, v12, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
+; CHECK-NEXT: vmflt.vf v12, v16, ft0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI13_1)
+; CHECK-NEXT: fld ft0, %lo(.LCPI13_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vmflt.vv v13, v8, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vfsub.vf v16, v16, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 4 x double> @llvm.floor.nxv4f64(<vscale x 4 x double> %x)
ret <vscale x 4 x double> %a
@@ -329,20 +365,23 @@ declare <vscale x 4 x double> @llvm.floor.nxv4f64(<vscale x 4 x double>)
define <vscale x 8 x double> @floor_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-LABEL: floor_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT: vfcvt.f.x.v v16, v16
; CHECK-NEXT: lui a0, %hi(.LCPI14_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI14_0)(a0)
-; CHECK-NEXT: vmflt.vv v0, v8, v16
-; CHECK-NEXT: lui a0, %hi(.LCPI14_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI14_1)(a0)
-; CHECK-NEXT: vfsub.vf v24, v16, ft0
-; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu
; CHECK-NEXT: vfabs.v v24, v8
-; CHECK-NEXT: vmflt.vf v0, v24, ft1
-; CHECK-NEXT: vfsgnj.vv v16, v16, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vmflt.vf v16, v24, ft0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfcvt.rtz.x.f.v v24, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT: lui a0, %hi(.LCPI14_1)
+; CHECK-NEXT: fld ft0, %lo(.LCPI14_1)(a0)
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vmflt.vv v17, v8, v24, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vfsub.vf v24, v24, ft0, v0.t
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 8 x double> @llvm.floor.nxv8f64(<vscale x 8 x double> %x)
ret <vscale x 8 x double> %a
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index 245a2fdc441eb..c402593a528d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -1972,10 +1972,9 @@ define void @trunc_v8f16(<8 x half>* %x) {
; CHECK-NEXT: flh ft0, %lo(.LCPI91_0)(a1)
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%a = load <8 x half>, <8 x half>* %x
@@ -1994,10 +1993,9 @@ define void @trunc_v4f32(<4 x float>* %x) {
; CHECK-NEXT: flw ft0, %lo(.LCPI92_0)(a1)
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
%a = load <4 x float>, <4 x float>* %x
@@ -2016,10 +2014,9 @@ define void @trunc_v2f64(<2 x double>* %x) {
; CHECK-NEXT: fld ft0, %lo(.LCPI93_0)(a1)
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
; CHECK-NEXT: vse64.v v8, (a0)
; CHECK-NEXT: ret
%a = load <2 x double>, <2 x double>* %x
@@ -2033,21 +2030,23 @@ define void @ceil_v8f16(<8 x half>* %x) {
; CHECK-LABEL: ceil_v8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
+; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: lui a1, %hi(.LCPI94_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI94_0)(a1)
-; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: vmflt.vf v8, v8, ft0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a1, %hi(.LCPI94_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI94_1)(a1)
-; CHECK-NEXT: vfadd.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT: vse16.v v8, (a0)
+; CHECK-NEXT: flh ft0, %lo(.LCPI94_1)(a1)
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfadd.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfsgnj.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vse16.v v9, (a0)
; CHECK-NEXT: ret
%a = load <8 x half>, <8 x half>* %x
%b = call <8 x half> @llvm.ceil.v8f16(<8 x half> %a)
@@ -2060,21 +2059,23 @@ define void @ceil_v4f32(<4 x float>* %x) {
; CHECK-LABEL: ceil_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
+; CHECK-NEXT: vle32.v v9, (a0)
; CHECK-NEXT: lui a1, %hi(.LCPI95_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI95_0)(a1)
-; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: vmflt.vf v8, v8, ft0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a1, %hi(.LCPI95_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI95_1)(a1)
-; CHECK-NEXT: vfadd.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: flw ft0, %lo(.LCPI95_1)(a1)
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfadd.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfsgnj.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vse32.v v9, (a0)
; CHECK-NEXT: ret
%a = load <4 x float>, <4 x float>* %x
%b = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a)
@@ -2087,21 +2088,23 @@ define void @ceil_v2f64(<2 x double>* %x) {
; CHECK-LABEL: ceil_v2f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
+; CHECK-NEXT: vle64.v v9, (a0)
; CHECK-NEXT: lui a1, %hi(.LCPI96_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI96_0)(a1)
-; CHECK-NEXT: vmflt.vv v0, v9, v8
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: vmflt.vf v8, v8, ft0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a1, %hi(.LCPI96_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI96_1)(a1)
-; CHECK-NEXT: vfadd.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: fld ft0, %lo(.LCPI96_1)(a1)
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v10, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfadd.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfsgnj.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vse64.v v9, (a0)
; CHECK-NEXT: ret
%a = load <2 x double>, <2 x double>* %x
%b = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a)
@@ -2114,21 +2117,23 @@ define void @floor_v8f16(<8 x half>* %x) {
; CHECK-LABEL: floor_v8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
+; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: lui a1, %hi(.LCPI97_0)
; CHECK-NEXT: flh ft0, %lo(.LCPI97_0)(a1)
-; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: vmflt.vf v8, v8, ft0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a1, %hi(.LCPI97_1)
-; CHECK-NEXT: flh ft1, %lo(.LCPI97_1)(a1)
-; CHECK-NEXT: vfsub.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT: vse16.v v8, (a0)
+; CHECK-NEXT: flh ft0, %lo(.LCPI97_1)(a1)
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v9, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT: vfsub.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfsgnj.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vse16.v v9, (a0)
; CHECK-NEXT: ret
%a = load <8 x half>, <8 x half>* %x
%b = call <8 x half> @llvm.floor.v8f16(<8 x half> %a)
@@ -2141,21 +2146,23 @@ define void @floor_v4f32(<4 x float>* %x) {
; CHECK-LABEL: floor_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
+; CHECK-NEXT: vle32.v v9, (a0)
; CHECK-NEXT: lui a1, %hi(.LCPI98_0)
; CHECK-NEXT: flw ft0, %lo(.LCPI98_0)(a1)
-; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: vmflt.vf v8, v8, ft0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a1, %hi(.LCPI98_1)
-; CHECK-NEXT: flw ft1, %lo(.LCPI98_1)(a1)
-; CHECK-NEXT: vfsub.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: flw ft0, %lo(.LCPI98_1)(a1)
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v9, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT: vfsub.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfsgnj.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vse32.v v9, (a0)
; CHECK-NEXT: ret
%a = load <4 x float>, <4 x float>* %x
%b = call <4 x float> @llvm.floor.v4f32(<4 x float> %a)
@@ -2168,21 +2175,23 @@ define void @floor_v2f64(<2 x double>* %x) {
; CHECK-LABEL: floor_v2f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
+; CHECK-NEXT: vle64.v v9, (a0)
; CHECK-NEXT: lui a1, %hi(.LCPI99_0)
; CHECK-NEXT: fld ft0, %lo(.LCPI99_0)(a1)
-; CHECK-NEXT: vmflt.vv v0, v8, v9
+; CHECK-NEXT: vfabs.v v8, v9
+; CHECK-NEXT: vmflt.vf v8, v8, ft0
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v9, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
; CHECK-NEXT: lui a1, %hi(.LCPI99_1)
-; CHECK-NEXT: fld ft1, %lo(.LCPI99_1)(a1)
-; CHECK-NEXT: vfsub.vf v10, v9, ft0
-; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
-; CHECK-NEXT: vfabs.v v10, v8
-; CHECK-NEXT: vmflt.vf v0, v10, ft1
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
-; CHECK-NEXT: vse64.v v8, (a0)
+; CHECK-NEXT: fld ft0, %lo(.LCPI99_1)(a1)
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmflt.vv v0, v9, v10, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu
+; CHECK-NEXT: vfsub.vf v10, v10, ft0, v0.t
+; CHECK-NEXT: vmv.v.v v0, v8
+; CHECK-NEXT: vfsgnj.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vse64.v v9, (a0)
; CHECK-NEXT: ret
%a = load <2 x double>, <2 x double>* %x
%b = call <2 x double> @llvm.floor.v2f64(<2 x double> %a)
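[Editorial sketch, not part of the patch: the updated ceil/floor CHECK lines above drop the trailing vmerge. The |x|-below-threshold compare writes the mask, the round-trip conversion and the +/-1.0 correction execute under v0.t, and the masked vfsgnj.vv writes into a destination that still holds the source, so out-of-range and NaN lanes pass through unconverted and never raise inexact/invalid flags. A minimal per-lane model of the f32 ceil case in C, assuming the .LCPI*_0 pool holds the 2^23 threshold and .LCPI*_1 holds 1.0 (names and constants are my reading of the checks, not taken from the compiler):

#include <math.h>
#include <stdint.h>

/* Illustrative scalar model of the masked vector ceil sequence. */
static float ceil_lane_f32(float x) {
  const float max_integral = 8388608.0f;  /* 2^23: at or above this, no
                                             fraction bits remain */
  if (fabsf(x) < max_integral) {          /* vmflt.vf computes the mask */
    float t = (float)(int32_t)x;          /* vfcvt.rtz.x.f.v + vfcvt.f.x.v */
    if (t < x)                            /* vmflt.vv ... v0.t refines v0 */
      t += 1.0f;                          /* vfadd.vf ... v0.t */
    return copysignf(t, x);               /* vfsgnj.vv ... v0.t keeps -0.0 */
  }
  return x;                               /* inactive lane: source passes
                                             through untouched */
}

The floor variant is identical except the correction is a masked vfsub.vf when the rounded value exceeds the source.]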
diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
index 80ff02524a809..23a2574d4fbad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
@@ -12,10 +12,9 @@ define <vscale x 1 x half> @trunc_nxv1f16(<vscale x 1 x half> %x) {
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 1 x half> @llvm.trunc.nxv1f16(<vscale x 1 x half> %x)
ret <vscale x 1 x half> %a
@@ -30,10 +29,9 @@ define <vscale x 2 x half> @trunc_nxv2f16(<vscale x 2 x half> %x) {
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 2 x half> @llvm.trunc.nxv2f16(<vscale x 2 x half> %x)
ret <vscale x 2 x half> %a
@@ -48,10 +46,9 @@ define <vscale x 4 x half> @trunc_nxv4f16(<vscale x 4 x half> %x) {
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 4 x half> @llvm.trunc.nxv4f16(<vscale x 4 x half> %x)
ret <vscale x 4 x half> %a
@@ -66,10 +63,9 @@ define <vscale x 8 x half> @trunc_nxv8f16(<vscale x 8 x half> %x) {
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT: vfcvt.f.x.v v10, v10
-; CHECK-NEXT: vfsgnj.vv v10, v10, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 8 x half> @llvm.trunc.nxv8f16(<vscale x 8 x half> %x)
ret <vscale x 8 x half> %a
@@ -84,10 +80,9 @@ define <vscale x 16 x half> @trunc_nxv16f16(<vscale x 16 x half> %x) {
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT: vfcvt.f.x.v v12, v12
-; CHECK-NEXT: vfsgnj.vv v12, v12, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 16 x half> @llvm.trunc.nxv16f16(<vscale x 16 x half> %x)
ret <vscale x 16 x half> %a
@@ -102,10 +97,9 @@ define <vscale x 32 x half> @trunc_nxv32f16(<vscale x 32 x half> %x) {
; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT: vfcvt.f.x.v v16, v16
-; CHECK-NEXT: vfsgnj.vv v16, v16, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 32 x half> @llvm.trunc.nxv32f16(<vscale x 32 x half> %x)
ret <vscale x 32 x half> %a
@@ -120,10 +114,9 @@ define <vscale x 1 x float> @trunc_nxv1f32(<vscale x 1 x float> %x) {
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 1 x float> @llvm.trunc.nxv1f32(<vscale x 1 x float> %x)
ret <vscale x 1 x float> %a
@@ -138,10 +131,9 @@ define <vscale x 2 x float> @trunc_nxv2f32(<vscale x 2 x float> %x) {
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 2 x float> @llvm.trunc.nxv2f32(<vscale x 2 x float> %x)
ret <vscale x 2 x float> %a
@@ -156,10 +148,9 @@ define <vscale x 4 x float> @trunc_nxv4f32(<vscale x 4 x float> %x) {
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT: vfcvt.f.x.v v10, v10
-; CHECK-NEXT: vfsgnj.vv v10, v10, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> %x)
ret <vscale x 4 x float> %a
@@ -174,10 +165,9 @@ define <vscale x 8 x float> @trunc_nxv8f32(<vscale x 8 x float> %x) {
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, mu
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT: vfcvt.f.x.v v12, v12
-; CHECK-NEXT: vfsgnj.vv v12, v12, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 8 x float> @llvm.trunc.nxv8f32(<vscale x 8 x float> %x)
ret <vscale x 8 x float> %a
@@ -192,10 +182,9 @@ define <vscale x 16 x float> @trunc_nxv16f32(<vscale x 16 x float> %x) {
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT: vfcvt.f.x.v v16, v16
-; CHECK-NEXT: vfsgnj.vv v16, v16, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 16 x float> @llvm.trunc.nxv16f32(<vscale x 16 x float> %x)
ret <vscale x 16 x float> %a
@@ -210,10 +199,9 @@ define <vscale x 1 x double> @trunc_nxv1f64(<vscale x 1 x double> %x) {
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu
; CHECK-NEXT: vfabs.v v9, v8
; CHECK-NEXT: vmflt.vf v0, v9, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8
-; CHECK-NEXT: vfcvt.f.x.v v9, v9
-; CHECK-NEXT: vfsgnj.vv v9, v9, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 1 x double> @llvm.trunc.nxv1f64(<vscale x 1 x double> %x)
ret <vscale x 1 x double> %a
@@ -228,10 +216,9 @@ define <vscale x 2 x double> @trunc_nxv2f64(<vscale x 2 x double> %x) {
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu
; CHECK-NEXT: vfabs.v v10, v8
; CHECK-NEXT: vmflt.vf v0, v10, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8
-; CHECK-NEXT: vfcvt.f.x.v v10, v10
-; CHECK-NEXT: vfsgnj.vv v10, v10, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> %x)
ret <vscale x 2 x double> %a
@@ -246,10 +233,9 @@ define <vscale x 4 x double> @trunc_nxv4f64(<vscale x 4 x double> %x) {
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu
; CHECK-NEXT: vfabs.v v12, v8
; CHECK-NEXT: vmflt.vf v0, v12, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8
-; CHECK-NEXT: vfcvt.f.x.v v12, v12
-; CHECK-NEXT: vfsgnj.vv v12, v12, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 4 x double> @llvm.trunc.nxv4f64(<vscale x 4 x double> %x)
ret <vscale x 4 x double> %a
@@ -264,10 +250,9 @@ define <vscale x 8 x double> @trunc_nxv8f64(<vscale x 8 x double> %x) {
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu
; CHECK-NEXT: vfabs.v v16, v8
; CHECK-NEXT: vmflt.vf v0, v16, ft0
-; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8
-; CHECK-NEXT: vfcvt.f.x.v v16, v16
-; CHECK-NEXT: vfsgnj.vv v16, v16, v8
-; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t
+; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t
; CHECK-NEXT: ret
%a = call <vscale x 8 x double> @llvm.trunc.nxv8f64(<vscale x 8 x double> %x)
ret <vscale x 8 x double> %a