[llvm] 5280d3e - [RISCV] Teach lowerCTLZ_CTTZ_ZERO_UNDEF to handle conversion i32/i64 vectors to f32 vectors.
Yeting Kuo via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 11 22:42:55 PST 2023
Author: Yeting Kuo
Date: 2023-01-12T14:42:47+08:00
New Revision: 5280d3e7384835bb6ee797def32c98f30afaee98
URL: https://github.com/llvm/llvm-project/commit/5280d3e7384835bb6ee797def32c98f30afaee98
DIFF: https://github.com/llvm/llvm-project/commit/5280d3e7384835bb6ee797def32c98f30afaee98.diff
LOG: [RISCV] Teach lowerCTLZ_CTTZ_ZERO_UNDEF to handle conversion i32/i64 vectors to f32 vectors.
Previously, lowerCTLZ_CTTZ_ZERO_UNDEF converted the source to a floating-point
value with ISD::UINT_TO_FP. ISD::UINT_TO_FP uses the dynamic rounding mode, so
rounding could give the result an unexpected exponent when converting i32/i64 to f32.
This is why lowerCTLZ_CTTZ_ZERO_UNDEF was constrained to handle an i32 source
only when the f64 vector type with the same element count as the source is legal.
This patch teaches lowerCTLZ_CTTZ_ZERO_UNDEF to convert i32/i64 vectors to f32
vectors with vfcvt.f.xu.v using the RTZ rounding mode. RTZ guarantees the
exponent of the result is correct even though f32 cannot represent every
i32/i64 value exactly.
Reviewed By: craig.topper
Differential Revision: https://reviews.llvm.org/D140782
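
To make the exponent trick and the need for RTZ concrete, here is a minimal
scalar sketch (not the actual vector lowering from the diff below): convert the
integer to f32, read the biased exponent, and derive ctlz/cttz from it. Under
the default round-to-nearest mode a value such as 0x01ffffff rounds up to 2^25,
bumping the exponent and producing an off-by-one leading-zero count; rounding
toward zero keeps the exponent at floor(log2(V)). The helper names are
hypothetical, and the sketch assumes IEEE-754 f32 and a compiler that honors
fesetround for the int-to-float conversion (e.g. with -frounding-math).

  #include <cfenv>
  #include <cstdint>
  #include <cstring>

  // Leading zeros of a nonzero 32-bit value via the f32 exponent.
  // Mirrors CTLZ_ZERO_UNDEF semantics: the result is unspecified for V == 0.
  // Note: strictly conforming code needs FENV_ACCESS (or -frounding-math)
  // for the rounding-mode change to affect the conversion.
  static unsigned ctlzViaF32(uint32_t V) {
    std::fesetround(FE_TOWARDZERO);  // RTZ, like vfcvt.f.xu.v in the lowering
    float F = static_cast<float>(V); // e.g. 0x01ffffff must not round up to 2^25
    uint32_t Bits;
    std::memcpy(&Bits, &F, sizeof(Bits));
    unsigned BiasedExp = Bits >> 23; // f32 exponent bias is 127
    return (127 + 31) - BiasedExp;   // 31 - floor(log2(V))
  }

  // Trailing zeros of a nonzero 32-bit value: isolate the lowest set bit
  // (a power of two, always exact in f32) and read its exponent.
  static unsigned cttzViaF32(uint32_t V) {
    uint32_t LowBit = V & -V;
    float F = static_cast<float>(LowBit);
    uint32_t Bits;
    std::memcpy(&Bits, &F, sizeof(Bits));
    return (Bits >> 23) - 127;       // exponent minus bias = log2(LowBit)
  }

For example, ctlzViaF32(0x01ffffff) reads a biased exponent of 151 (2^24 <=
value < 2^25 after truncation) and returns 7, whereas a round-to-nearest
conversion would yield 152 and an incorrect count of 6. This matches the
constants in the vector lowering: shift by 23, bias 127, and adjust
127 + 31 = 158 for i32 elements.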
Added:
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.h
llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 169ff9d22f989..5c8bd226b4db3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -677,16 +677,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Splice
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
- // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
- // type that can represent the value exactly.
- if (VT.getVectorElementType() != MVT::i64) {
- MVT FloatEltVT =
- VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
- EVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
- if (isTypeLegal(FloatVT)) {
- setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
- Custom);
- }
+ // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the range
+ // of f32.
+ EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+ if (isTypeLegal(FloatVT)) {
+ setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
+ Custom);
}
}
@@ -912,17 +908,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(IntegerVPOps, VT, Custom);
- // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
- // type that can represent the value exactly.
- if (VT.getVectorElementType() != MVT::i64) {
- MVT FloatEltVT =
- VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
- EVT FloatVT =
- MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
- if (isTypeLegal(FloatVT))
- setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
- Custom);
- }
+ // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
+ // range of f32.
+ EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+ if (isTypeLegal(FloatVT))
+ setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
+ Custom);
}
for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
@@ -3535,15 +3526,20 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
// Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting
// the exponent.
-static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+SDValue
+RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op,
+ SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
unsigned EltSize = VT.getScalarSizeInBits();
SDValue Src = Op.getOperand(0);
SDLoc DL(Op);
- // We need a FP type that can represent the value.
+ // We choose FP type that can represent the value if possible. Otherwise, we
+ // use rounding to zero conversion for correct exponent of the result.
// TODO: Use f16 for i8 when possible?
- MVT FloatEltVT = EltSize == 32 ? MVT::f64 : MVT::f32;
+ MVT FloatEltVT = (EltSize >= 32) ? MVT::f64 : MVT::f32;
+ if (!isTypeLegal(MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount())))
+ FloatEltVT = MVT::f32;
MVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
// Legal types should have been checked in the RISCVTargetLowering
@@ -3560,27 +3556,50 @@ static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
}
// We have a legal FP type, convert to it.
- SDValue FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src);
+ SDValue FloatVal;
+ if (FloatVT.bitsGT(VT)) {
+ FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src);
+ } else {
+ // Use RTZ to avoid rounding influencing exponent of FloatVal.
+ MVT ContainerVT = VT;
+ if (VT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(VT);
+ Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
+ }
+
+ auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+ SDValue RTZRM =
+ DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, Subtarget.getXLenVT());
+ MVT ContainerFloatVT =
+ MVT::getVectorVT(FloatEltVT, ContainerVT.getVectorElementCount());
+ FloatVal = DAG.getNode(RISCVISD::VFCVT_RM_F_XU_VL, DL, ContainerFloatVT,
+ Src, Mask, RTZRM, VL);
+ if (VT.isFixedLengthVector())
+ FloatVal = convertFromScalableVector(FloatVT, FloatVal, DAG, Subtarget);
+ }
// Bitcast to integer and shift the exponent to the LSB.
EVT IntVT = FloatVT.changeVectorElementTypeToInteger();
SDValue Bitcast = DAG.getBitcast(IntVT, FloatVal);
unsigned ShiftAmt = FloatEltVT == MVT::f64 ? 52 : 23;
- SDValue Shift = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast,
- DAG.getConstant(ShiftAmt, DL, IntVT));
- // Truncate back to original type to allow vnsrl.
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Shift);
+ SDValue Exp = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast,
+ DAG.getConstant(ShiftAmt, DL, IntVT));
+ // Restore back to original type. Truncation after SRL is to generate vnsrl.
+ if (IntVT.bitsLT(VT))
+ Exp = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Exp);
+ else if (IntVT.bitsGT(VT))
+ Exp = DAG.getNode(ISD::TRUNCATE, DL, VT, Exp);
// The exponent contains log2 of the value in biased form.
unsigned ExponentBias = FloatEltVT == MVT::f64 ? 1023 : 127;
// For trailing zeros, we just need to subtract the bias.
if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF)
- return DAG.getNode(ISD::SUB, DL, VT, Trunc,
+ return DAG.getNode(ISD::SUB, DL, VT, Exp,
DAG.getConstant(ExponentBias, DL, VT));
// For leading zeros, we need to remove the bias and convert from log2 to
// leading zeros. We can do this by subtracting from (Bias + (EltSize - 1)).
unsigned Adjust = ExponentBias + (EltSize - 1);
- return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Trunc);
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Exp);
}
// While RVV has alignment restrictions, we should always be able to load as a
@@ -11571,6 +11590,28 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_X_F_V_MF2_MASK);
case RISCV::PseudoVFCVT_RM_X_F_V_MF4_MASK:
return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_X_F_V_MF4_MASK);
+ case RISCV::PseudoVFCVT_RM_F_XU_V_M1_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M1_MASK);
+ case RISCV::PseudoVFCVT_RM_F_XU_V_M2_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M2_MASK);
+ case RISCV::PseudoVFCVT_RM_F_XU_V_M4_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M4_MASK);
+ case RISCV::PseudoVFCVT_RM_F_XU_V_M8_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M8_MASK);
+ case RISCV::PseudoVFCVT_RM_F_XU_V_MF2_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_MF2_MASK);
+ case RISCV::PseudoVFCVT_RM_F_XU_V_MF4_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_MF4_MASK);
+ case RISCV::PseudoVFNCVT_RM_F_XU_W_M1_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_M1_MASK);
+ case RISCV::PseudoVFNCVT_RM_F_XU_W_M2_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_M2_MASK);
+ case RISCV::PseudoVFNCVT_RM_F_XU_W_M4_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_M4_MASK);
+ case RISCV::PseudoVFNCVT_RM_F_XU_W_MF2_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_MF2_MASK);
+ case RISCV::PseudoVFNCVT_RM_F_XU_W_MF4_MASK:
+ return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_MF4_MASK);
case RISCV::PseudoVFROUND_NOEXCEPT_V_M1_MASK:
return emitVFROUND_NOEXCEPT_MASK(MI, BB, RISCV::PseudoVFCVT_X_F_V_M1_MASK,
RISCV::PseudoVFCVT_F_X_V_M1_MASK);
@@ -13167,6 +13208,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VFROUND_NOEXCEPT_VL)
NODE_NAME_CASE(SINT_TO_FP_VL)
NODE_NAME_CASE(UINT_TO_FP_VL)
+ NODE_NAME_CASE(VFCVT_RM_F_XU_VL)
NODE_NAME_CASE(FP_EXTEND_VL)
NODE_NAME_CASE(FP_ROUND_VL)
NODE_NAME_CASE(VWMUL_VL)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 883715a0ceec3..3de2e4dd02328 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -243,6 +243,7 @@ enum NodeType : unsigned {
VFCVT_RM_X_F_VL, // Has a rounding mode operand.
SINT_TO_FP_VL,
UINT_TO_FP_VL,
+ VFCVT_RM_F_XU_VL, // Has a rounding mode operand.
FP_ROUND_VL,
FP_EXTEND_VL,
@@ -704,6 +705,7 @@ class RISCVTargetLowering : public TargetLowering {
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 291fdd93f0537..f2d22048babc9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -3406,6 +3406,17 @@ multiclass VPseudoVCVTF_V {
}
}
+multiclass VPseudoVCVTF_RM_V {
+ foreach m = MxListF in {
+ defvar mx = m.MX;
+ defvar WriteVFCvtIToFV_MX = !cast<SchedWrite>("WriteVFCvtIToFV_" # mx);
+ defvar ReadVFCvtIToFV_MX = !cast<SchedRead>("ReadVFCvtIToFV_" # mx);
+
+ defm _V : VPseudoConversionRM<m.vrclass, m.vrclass, m>,
+ Sched<[WriteVFCvtIToFV_MX, ReadVFCvtIToFV_MX, ReadVMask]>;
+ }
+}
+
multiclass VPseudoConversionW_V {
defvar constraint = "@earlyclobber $rd";
foreach m = MxListW in
@@ -3472,6 +3483,18 @@ multiclass VPseudoVNCVTF_W {
}
}
+multiclass VPseudoVNCVTF_RM_W {
+ defvar constraint = "@earlyclobber $rd";
+ foreach m = MxListFW in {
+ defvar mx = m.MX;
+ defvar WriteVFNCvtIToFV_MX = !cast<SchedWrite>("WriteVFNCvtIToFV_" # mx);
+ defvar ReadVFNCvtIToFV_MX = !cast<SchedRead>("ReadVFNCvtIToFV_" # mx);
+
+ defm _W : VPseudoConversionRM<m.vrclass, m.wvrclass, m, constraint>,
+ Sched<[WriteVFNCvtIToFV_MX, ReadVFNCvtIToFV_MX, ReadVMask]>;
+ }
+}
+
multiclass VPseudoVNCVTD_W {
defvar constraint = "@earlyclobber $rd";
foreach m = MxListFW in {
@@ -5495,6 +5518,7 @@ let Uses = [FRM] in {
defm PseudoVFCVT_F_XU : VPseudoVCVTF_V;
defm PseudoVFCVT_F_X : VPseudoVCVTF_V;
}
+defm PseudoVFCVT_RM_F_XU : VPseudoVCVTF_RM_V;
} // mayRaiseFPException = true
//===----------------------------------------------------------------------===//
@@ -5528,6 +5552,7 @@ defm PseudoVFNCVT_F_X : VPseudoVNCVTF_W;
defm PseudoVFNCVT_F_F : VPseudoVNCVTD_W;
}
defm PseudoVFNCVT_ROD_F_F : VPseudoVNCVTD_W;
+defm PseudoVFNCVT_RM_F_XU : VPseudoVNCVTF_RM_W;
} // mayRaiseFPException = true
} // Predicates = [HasVInstructionsAnyF]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 09b94b29cef8c..bbb55f8ef257e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -140,11 +140,17 @@ def SDT_RISCVI2FPOp_VL : SDTypeProfile<1, 3, [
SDTCisFP<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1>,
SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>
]>;
+def SDT_RISCVI2FPOp_RM_VL : SDTypeProfile<1, 4, [
+ SDTCisFP<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1>,
+ SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>,
+ SDTCisVT<4, XLenVT>
+]>;
def riscv_vfcvt_rtz_x_f_vl : SDNode<"RISCVISD::VFCVT_RTZ_X_F_VL", SDT_RISCVFP2IOp_VL>;
def riscv_vfcvt_rtz_xu_f_vl : SDNode<"RISCVISD::VFCVT_RTZ_XU_F_VL", SDT_RISCVFP2IOp_VL>;
def riscv_sint_to_fp_vl : SDNode<"RISCVISD::SINT_TO_FP_VL", SDT_RISCVI2FPOp_VL>;
def riscv_uint_to_fp_vl : SDNode<"RISCVISD::UINT_TO_FP_VL", SDT_RISCVI2FPOp_VL>;
+def riscv_vfcvt_rm_f_xu_vl : SDNode<"RISCVISD::VFCVT_RM_F_XU_VL", SDT_RISCVI2FPOp_RM_VL>;
def SDT_RISCVVecCvtF2XOp_VL : SDTypeProfile<1, 4, [
SDTCisInt<0>, SDTCisFP<1>, SDTCisSameNumEltsAs<0, 1>,
@@ -796,6 +802,18 @@ multiclass VPatConvertI2FPVL_V<SDNode vop, string instruction_name> {
}
}
+multiclass VPatConvertI2FP_RM_VL_V<SDNode vop, string instruction_name> {
+ foreach fvti = AllFloatVectors in {
+ defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+ def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1),
+ (ivti.Mask V0), (XLenVT timm:$frm),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
+ (fvti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1,
+ (ivti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
+ }
+}
+
multiclass VPatWConvertFP2IVL_V<SDNode vop, string instruction_name> {
foreach fvtiToFWti = AllWidenableFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
@@ -848,6 +866,19 @@ multiclass VPatNConvertI2FPVL_V<SDNode vop, string instruction_name> {
}
}
+multiclass VPatNConvertI2FP_RM_VL_V<SDNode vop, string instruction_name> {
+ foreach fvtiToFWti = AllWidenableFloatVectors in {
+ defvar fvti = fvtiToFWti.Vti;
+ defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
+ def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1),
+ (iwti.Mask V0), (XLenVT timm:$frm),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
+ (fvti.Vector (IMPLICIT_DEF)), iwti.RegClass:$rs1,
+ (iwti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
+ }
+}
+
multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
@@ -1713,6 +1744,7 @@ foreach fvti = AllFloatVectors in {
defm : VPatConvertFP2IVL_V<riscv_vfcvt_rtz_xu_f_vl, "PseudoVFCVT_RTZ_XU_F_V">;
defm : VPatConvertI2FPVL_V<riscv_sint_to_fp_vl, "PseudoVFCVT_F_X_V">;
defm : VPatConvertI2FPVL_V<riscv_uint_to_fp_vl, "PseudoVFCVT_F_XU_V">;
+ defm : VPatConvertI2FP_RM_VL_V<riscv_vfcvt_rm_f_xu_vl, "PseudoVFCVT_RM_F_XU_V">;
// 13.18. Widening Floating-Point/Integer Type-Convert Instructions
defm : VPatWConvertFP2IVL_V<riscv_vfcvt_rtz_x_f_vl, "PseudoVFWCVT_RTZ_X_F_V">;
@@ -1735,6 +1767,8 @@ foreach fvti = AllFloatVectors in {
defm : VPatNConvertFP2IVL_V<riscv_vfcvt_rtz_xu_f_vl, "PseudoVFNCVT_RTZ_XU_F_W">;
defm : VPatNConvertI2FPVL_V<riscv_sint_to_fp_vl, "PseudoVFNCVT_F_X_W">;
defm : VPatNConvertI2FPVL_V<riscv_uint_to_fp_vl, "PseudoVFNCVT_F_XU_W">;
+ defm :
+ VPatNConvertI2FP_RM_VL_V<riscv_vfcvt_rm_f_xu_vl, "PseudoVFNCVT_RM_F_XU_W">;
foreach fvtiToFWti = AllWidenableFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index 27250b4274956..634d0850cc45a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV32,RV32I
; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV64,RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV64
; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV64
@@ -29,6 +31,20 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv1i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT: vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vrsub.vx v8, v9, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv1i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
@@ -72,6 +88,20 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv2i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT: vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vrsub.vx v8, v9, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv2i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
@@ -115,6 +145,20 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv4i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT: vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vrsub.vx v8, v9, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv4i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma
@@ -158,6 +202,20 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv8i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT: vnsrl.wi v10, v12, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v9, v10, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vrsub.vx v8, v9, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv8i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma
@@ -201,6 +259,20 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv16i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v12, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT: vnsrl.wi v12, v16, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v10, v12, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vrsub.vx v8, v10, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv16i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma
@@ -344,6 +416,18 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv1i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v9, v9, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv1i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
@@ -427,6 +511,18 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv2i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v9, v9, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv2i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
@@ -510,6 +606,18 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv4i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v8
+; CHECK-F-NEXT: vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v9, v9, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv4i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma
@@ -593,6 +701,18 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv8i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v8
+; CHECK-F-NEXT: vnsrl.wi v10, v12, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v10, v10, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v10, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv8i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma
@@ -676,6 +796,18 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv16i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v8
+; CHECK-F-NEXT: vnsrl.wi v12, v16, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v12, v12, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v12, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv16i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma
@@ -836,6 +968,21 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv1i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v9, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v9, v9, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v9, v9, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv1i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
@@ -929,6 +1076,21 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv2i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v9, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v9, v9, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v9, v9, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv2i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma
@@ -1022,6 +1184,21 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv4i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v10, v10, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v10, v10, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv4i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma
@@ -1115,6 +1292,21 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_nxv8i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v12, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v12, v12, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v12, v12, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_nxv8i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma
@@ -1136,481 +1328,643 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
declare <vscale x 8 x i32> @llvm.ctlz.nxv8i32(<vscale x 8 x i32>, i1)
define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: ctlz_nxv16i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v16, v16, a0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v16, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_nxv16i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v16, v16, a0
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v16, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v16
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv16i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_nxv16i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vsub.vv v8, v8, v16
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v16, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v16, v8
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_nxv16i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v16, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v16, v16, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v16, v16, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_nxv16i32:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v16, v8, v0.t
+; CHECK-D-NEXT: vsrl.vi v16, v16, 23
+; CHECK-D-NEXT: li a1, 158
+; CHECK-D-NEXT: vrsub.vx v16, v16, a1
+; CHECK-D-NEXT: vmseq.vi v0, v8, 0
+; CHECK-D-NEXT: li a1, 32
+; CHECK-D-NEXT: vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 false)
ret <vscale x 16 x i32> %a
}
declare <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32>, i1)
define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: ctlz_nxv1i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 1
-; RV32-NEXT: vand.vv v9, v11, v9
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: vand.vv v9, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v11
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vmul.vv v8, v8, v10
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_nxv1i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: vsrl.vx v9, v8, a0
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vsrl.vi v11, v8, 1
+; RV32I-NEXT: vand.vv v9, v11, v9
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v10
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vsrl.vi v11, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v11
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vmul.vv v8, v8, v10
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv1i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v9, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: lui a0, %hi(.LCPI18_0)
-; RV64-NEXT: ld a0, %lo(.LCPI18_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI18_1)
-; RV64-NEXT: ld a1, %lo(.LCPI18_1)(a1)
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: vand.vx v9, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: lui a0, %hi(.LCPI18_2)
-; RV64-NEXT: ld a0, %lo(.LCPI18_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI18_3)
-; RV64-NEXT: ld a1, %lo(.LCPI18_3)(a1)
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_nxv1i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: li a0, 32
+; RV64I-NEXT: vsrl.vx v9, v8, a0
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI18_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI18_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI18_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI18_1)(a1)
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: vand.vx v9, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI18_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI18_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI18_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI18_3)(a1)
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_nxv1i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v9, v9, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v9
+; CHECK-F-NEXT: li a1, 190
+; CHECK-F-NEXT: vrsub.vx v9, v10, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 64
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_nxv1i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v9, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v9, v9, a1
+; CHECK-D-NEXT: li a1, 1086
+; CHECK-D-NEXT: vrsub.vx v9, v9, a1
+; CHECK-D-NEXT: vmseq.vi v0, v8, 0
+; CHECK-D-NEXT: li a1, 64
+; CHECK-D-NEXT: vmerge.vxm v8, v9, a1, v0
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 false)
ret <vscale x 1 x i64> %a
}
declare <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64>, i1)
define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: ctlz_nxv2i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v10, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 1
-; RV32-NEXT: vand.vv v10, v14, v10
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: vand.vv v10, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v14
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vmul.vv v8, v8, v12
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_nxv2i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: vsrl.vx v10, v8, a0
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vsrl.vi v14, v8, 1
+; RV32I-NEXT: vand.vv v10, v14, v10
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v12
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vsrl.vi v14, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v14
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vmul.vv v8, v8, v12
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv2i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v10, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: lui a0, %hi(.LCPI19_0)
-; RV64-NEXT: ld a0, %lo(.LCPI19_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI19_1)
-; RV64-NEXT: ld a1, %lo(.LCPI19_1)(a1)
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: vand.vx v10, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: lui a0, %hi(.LCPI19_2)
-; RV64-NEXT: ld a0, %lo(.LCPI19_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI19_3)
-; RV64-NEXT: ld a1, %lo(.LCPI19_3)(a1)
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_nxv2i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: li a0, 32
+; RV64I-NEXT: vsrl.vx v10, v8, a0
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI19_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI19_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI19_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI19_1)(a1)
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: vand.vx v10, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI19_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI19_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI19_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI19_3)(a1)
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_nxv2i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v10, v10, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v12, v10
+; CHECK-F-NEXT: li a1, 190
+; CHECK-F-NEXT: vrsub.vx v10, v12, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 64
+; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_nxv2i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v10, v10, a1
+; CHECK-D-NEXT: li a1, 1086
+; CHECK-D-NEXT: vrsub.vx v10, v10, a1
+; CHECK-D-NEXT: vmseq.vi v0, v8, 0
+; CHECK-D-NEXT: li a1, 64
+; CHECK-D-NEXT: vmerge.vxm v8, v10, a1, v0
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 false)
ret <vscale x 2 x i64> %a
}
declare <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64>, i1)
define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
-; RV32-LABEL: ctlz_nxv4i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v12, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 1
-; RV32-NEXT: vand.vv v12, v20, v12
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: vand.vv v12, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v20
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: ctlz_nxv4i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v12, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: lui a0, %hi(.LCPI20_0)
-; RV64-NEXT: ld a0, %lo(.LCPI20_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI20_1)
-; RV64-NEXT: ld a1, %lo(.LCPI20_1)(a1)
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: vand.vx v12, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: lui a0, %hi(.LCPI20_2)
-; RV64-NEXT: ld a0, %lo(.LCPI20_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI20_3)
-; RV64-NEXT: ld a1, %lo(.LCPI20_3)(a1)
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
- %a = call <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 false)
- ret <vscale x 4 x i64> %a
-}
-declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
-
-define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
-; RV32-LABEL: ctlz_nxv8i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v16, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: vand.vv v16, v0, v16
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: vand.vv v16, v8, v24
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v0
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vmul.vv v8, v8, v24
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_nxv4i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: vsrl.vx v12, v8, a0
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vsrl.vi v20, v8, 1
+; RV32I-NEXT: vand.vv v12, v20, v12
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v16
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vsrl.vi v20, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v20
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vmul.vv v8, v8, v16
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_nxv8i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: lui a0, %hi(.LCPI21_0)
-; RV64-NEXT: ld a0, %lo(.LCPI21_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI21_1)
-; RV64-NEXT: ld a1, %lo(.LCPI21_1)(a1)
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: vand.vx v16, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: lui a0, %hi(.LCPI21_2)
-; RV64-NEXT: ld a0, %lo(.LCPI21_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI21_3)
-; RV64-NEXT: ld a1, %lo(.LCPI21_3)(a1)
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_nxv4i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: li a0, 32
+; RV64I-NEXT: vsrl.vx v12, v8, a0
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI20_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI20_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI20_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI20_1)(a1)
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: vand.vx v12, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI20_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI20_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI20_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI20_3)(a1)
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_nxv4i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v12, v12, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v16, v12
+; CHECK-F-NEXT: li a1, 190
+; CHECK-F-NEXT: vrsub.vx v12, v16, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 64
+; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_nxv4i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v12, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v12, v12, a1
+; CHECK-D-NEXT: li a1, 1086
+; CHECK-D-NEXT: vrsub.vx v12, v12, a1
+; CHECK-D-NEXT: vmseq.vi v0, v8, 0
+; CHECK-D-NEXT: li a1, 64
+; CHECK-D-NEXT: vmerge.vxm v8, v12, a1, v0
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
+ %a = call <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 false)
+ ret <vscale x 4 x i64> %a
+}
+declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
+
+define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
+; RV32I-LABEL: ctlz_nxv8i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: vsrl.vx v16, v8, a0
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vlse64.v v24, (a0), zero
+; RV32I-NEXT: vsrl.vi v0, v8, 1
+; RV32I-NEXT: vand.vv v16, v0, v16
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: vand.vv v16, v8, v24
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vlse64.v v24, (a0), zero
+; RV32I-NEXT: vsrl.vi v0, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v0
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vmul.vv v8, v8, v24
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: ctlz_nxv8i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: li a0, 32
+; RV64I-NEXT: vsrl.vx v16, v8, a0
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI21_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI21_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI21_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI21_1)(a1)
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vsub.vv v8, v8, v16
+; RV64I-NEXT: vand.vx v16, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v16, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI21_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI21_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI21_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI21_3)(a1)
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_nxv8i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v16, v16, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v24, v16
+; CHECK-F-NEXT: li a1, 190
+; CHECK-F-NEXT: vrsub.vx v16, v24, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 64
+; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_nxv8i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v16, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v16, v16, a1
+; CHECK-D-NEXT: li a1, 1086
+; CHECK-D-NEXT: vrsub.vx v16, v16, a1
+; CHECK-D-NEXT: vmseq.vi v0, v8, 0
+; CHECK-D-NEXT: li a1, 64
+; CHECK-D-NEXT: vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 8 x i64> @llvm.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 false)
ret <vscale x 8 x i64> %a
}
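The magic numbers in the CHECK-F/CHECK-D sequences above fall out of reading the leading-zero count from the biased exponent of the converted value: under RTZ the exponent field holds floor(log2(x)) plus the bias, so ctlz is (element width - 1 + bias) minus that field — 158 for i32 via f32, 190 for i64 via f32, 1086 for i64 via f64 (shift by 52), and 134/142 for the widened i8/i16 cases — with zero lanes patched afterwards by vmseq.vi/vmerge.vxm. A minimal scalar sketch of the i64-via-f32 path, not part of the patch, assuming the integer-to-float cast honors the dynamic rounding mode (e.g. built with -frounding-math or equivalent):

// Scalar model of the CHECK-F ctlz lowering for i64 (sketch only, not part of
// the patch). RTZ guarantees rounding never bumps the exponent, which is all
// the lowering needs even though f32 cannot represent every i64 exactly.
#include <cassert>
#include <cfenv>
#include <cstdint>
#include <cstring>

static unsigned ctlz64_via_f32(uint64_t x) {
  assert(x != 0 && "zero lanes are merged to 64 separately");
  std::fesetround(FE_TOWARDZERO);     // fsrmi a0, 1
  float f = static_cast<float>(x);    // vfncvt.f.xu.w: exponent field = floor(log2 x) + 127
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  unsigned exp = bits >> 23;          // vsrl.vi 23
  return 190u - exp;                  // vrsub.vx 190; 190 = 63 + 127
}

int main() { return ctlz64_via_f32(1) == 63 ? 0 : 1; }

The CHECK-D variant is the same calculation with the f64 bias: shift by 52 and subtract from 1086 = 63 + 1023.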
@@ -1641,6 +1995,18 @@ define <vscale x 1 x i8> @ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv1i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vrsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv1i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
@@ -1681,6 +2047,18 @@ define <vscale x 2 x i8> @ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv2i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vrsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv2i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
@@ -1721,6 +2099,18 @@ define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv4i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT: vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vrsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv4i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma
@@ -1761,6 +2151,18 @@ define <vscale x 8 x i8> @ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv8i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT: vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v10, v8, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vrsub.vx v8, v10, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv8i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma
@@ -1801,6 +2203,18 @@ define <vscale x 16 x i8> @ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv16i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v12, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT: vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v12, v8, 0
+; CHECK-F-NEXT: li a0, 134
+; CHECK-F-NEXT: vrsub.vx v8, v12, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv16i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma
@@ -1939,6 +2353,15 @@ define <vscale x 1 x i16> @ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv1i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v9, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv1i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
@@ -2018,6 +2441,15 @@ define <vscale x 2 x i16> @ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv2i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v9, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv2i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
@@ -2097,6 +2529,15 @@ define <vscale x 4 x i16> @ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv4i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv4i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma
@@ -2176,6 +2617,15 @@ define <vscale x 8 x i16> @ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv8i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv8i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma
@@ -2255,6 +2705,15 @@ define <vscale x 16 x i16> @ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv16i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT: li a0, 142
+; CHECK-F-NEXT: vrsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv16i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma
@@ -2410,6 +2869,18 @@ define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv1i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv1i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
@@ -2499,6 +2970,18 @@ define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv2i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv2i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma
@@ -2588,6 +3071,18 @@ define <vscale x 4 x i32> @ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv4i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv4i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma
@@ -2677,6 +3172,18 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv8i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: ctlz_zero_undef_nxv8i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma
@@ -2694,477 +3201,609 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
}
define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv16i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v16, v16, a0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v16, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_zero_undef_nxv16i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v16, v16, a0
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v16, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v16
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv16i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_zero_undef_nxv16i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vsub.vv v8, v8, v16
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v16, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v16, v8
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv16i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 158
+; CHECK-F-NEXT: vrsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv16i32:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: vsrl.vi v8, v8, 23
+; CHECK-D-NEXT: li a1, 158
+; CHECK-D-NEXT: vrsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 true)
ret <vscale x 16 x i32> %a
}
define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv1i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vi v9, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v9, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v9, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v9
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 1
-; RV32-NEXT: vand.vv v9, v11, v9
-; RV32-NEXT: vsub.vv v8, v8, v9
-; RV32-NEXT: vand.vv v9, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v9, v8
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v11
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vmul.vv v8, v8, v10
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_zero_undef_nxv1i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT: vsrl.vi v9, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: vsrl.vi v9, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: vsrl.vx v9, v8, a0
+; RV32I-NEXT: vor.vv v8, v8, v9
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vsrl.vi v11, v8, 1
+; RV32I-NEXT: vand.vv v9, v11, v9
+; RV32I-NEXT: vsub.vv v8, v8, v9
+; RV32I-NEXT: vand.vv v9, v8, v10
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v9, v8
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vsrl.vi v11, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v11
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vmul.vv v8, v8, v10
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv1i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vsrl.vi v9, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v9, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: lui a0, %hi(.LCPI40_0)
-; RV64-NEXT: ld a0, %lo(.LCPI40_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI40_1)
-; RV64-NEXT: ld a1, %lo(.LCPI40_1)(a1)
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: vand.vx v9, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: lui a0, %hi(.LCPI40_2)
-; RV64-NEXT: ld a0, %lo(.LCPI40_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI40_3)
-; RV64-NEXT: ld a1, %lo(.LCPI40_3)(a1)
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_zero_undef_nxv1i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vsrl.vi v9, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: li a0, 32
+; RV64I-NEXT: vsrl.vx v9, v8, a0
+; RV64I-NEXT: vor.vv v8, v8, v9
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI40_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI40_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI40_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI40_1)(a1)
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: vand.vx v9, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI40_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI40_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI40_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI40_3)(a1)
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv1i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v9, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: li a1, 190
+; CHECK-F-NEXT: vrsub.vx v8, v9, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v8, v8, a1
+; CHECK-D-NEXT: li a1, 1086
+; CHECK-D-NEXT: vrsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 true)
ret <vscale x 1 x i64> %a
}
define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv2i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vi v10, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: vsrl.vi v10, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v10, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v10
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 1
-; RV32-NEXT: vand.vv v10, v14, v10
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: vand.vv v10, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v14
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vmul.vv v8, v8, v12
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_zero_undef_nxv2i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT: vsrl.vi v10, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: vsrl.vi v10, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: vsrl.vx v10, v8, a0
+; RV32I-NEXT: vor.vv v8, v8, v10
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vsrl.vi v14, v8, 1
+; RV32I-NEXT: vand.vv v10, v14, v10
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v12
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vsrl.vi v14, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v14
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vmul.vv v8, v8, v12
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv2i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vsrl.vi v10, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v10, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: lui a0, %hi(.LCPI41_0)
-; RV64-NEXT: ld a0, %lo(.LCPI41_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI41_1)
-; RV64-NEXT: ld a1, %lo(.LCPI41_1)(a1)
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: vand.vx v10, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: lui a0, %hi(.LCPI41_2)
-; RV64-NEXT: ld a0, %lo(.LCPI41_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI41_3)
-; RV64-NEXT: ld a1, %lo(.LCPI41_3)(a1)
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_zero_undef_nxv2i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vsrl.vi v10, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: li a0, 32
+; RV64I-NEXT: vsrl.vx v10, v8, a0
+; RV64I-NEXT: vor.vv v8, v8, v10
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI41_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI41_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI41_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI41_1)(a1)
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: vand.vx v10, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI41_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI41_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI41_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI41_3)(a1)
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv2i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v10, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v8
+; CHECK-F-NEXT: li a1, 190
+; CHECK-F-NEXT: vrsub.vx v8, v10, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v8, v8, a1
+; CHECK-D-NEXT: li a1, 1086
+; CHECK-D-NEXT: vrsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 true)
ret <vscale x 2 x i64> %a
}
define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv4i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT: vsrl.vi v12, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: vsrl.vi v12, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v12, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v12
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 1
-; RV32-NEXT: vand.vv v12, v20, v12
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: vand.vv v12, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v20
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_zero_undef_nxv4i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT: vsrl.vi v12, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: vsrl.vi v12, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: vsrl.vx v12, v8, a0
+; RV32I-NEXT: vor.vv v8, v8, v12
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vsrl.vi v20, v8, 1
+; RV32I-NEXT: vand.vv v12, v20, v12
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v16
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vsrl.vi v20, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v20
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vmul.vv v8, v8, v16
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv4i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vsrl.vi v12, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v12, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v12
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: lui a0, %hi(.LCPI42_0)
-; RV64-NEXT: ld a0, %lo(.LCPI42_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI42_1)
-; RV64-NEXT: ld a1, %lo(.LCPI42_1)(a1)
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: vand.vx v12, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: lui a0, %hi(.LCPI42_2)
-; RV64-NEXT: ld a0, %lo(.LCPI42_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI42_3)
-; RV64-NEXT: ld a1, %lo(.LCPI42_3)(a1)
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_zero_undef_nxv4i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vsrl.vi v12, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: li a0, 32
+; RV64I-NEXT: vsrl.vx v12, v8, a0
+; RV64I-NEXT: vor.vv v8, v8, v12
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI42_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI42_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI42_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI42_1)(a1)
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: vand.vx v12, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI42_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI42_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI42_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI42_3)(a1)
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv4i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v12, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v12, v8
+; CHECK-F-NEXT: li a1, 190
+; CHECK-F-NEXT: vrsub.vx v8, v12, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v8, v8, a1
+; CHECK-D-NEXT: li a1, 1086
+; CHECK-D-NEXT: vrsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 true)
ret <vscale x 4 x i64> %a
}
define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv8i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 2
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 8
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 16
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsrl.vx v16, v8, a0
-; RV32-NEXT: vor.vv v8, v8, v16
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: vand.vv v16, v0, v16
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: vand.vv v16, v8, v24
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v24
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v0
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vmul.vv v8, v8, v24
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: ctlz_zero_undef_nxv8i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 2
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 8
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 16
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: li a0, 32
+; RV32I-NEXT: vsrl.vx v16, v8, a0
+; RV32I-NEXT: vor.vv v8, v8, v16
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vlse64.v v24, (a0), zero
+; RV32I-NEXT: vsrl.vi v0, v8, 1
+; RV32I-NEXT: vand.vv v16, v0, v16
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: vand.vv v16, v8, v24
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v24
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vlse64.v v24, (a0), zero
+; RV32I-NEXT: vsrl.vi v0, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v0
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vmul.vv v8, v8, v24
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: ctlz_zero_undef_nxv8i64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 2
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 8
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 16
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: vsrl.vx v16, v8, a0
-; RV64-NEXT: vor.vv v8, v8, v16
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: lui a0, %hi(.LCPI43_0)
-; RV64-NEXT: ld a0, %lo(.LCPI43_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI43_1)
-; RV64-NEXT: ld a1, %lo(.LCPI43_1)(a1)
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: vand.vx v16, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: lui a0, %hi(.LCPI43_2)
-; RV64-NEXT: ld a0, %lo(.LCPI43_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI43_3)
-; RV64-NEXT: ld a1, %lo(.LCPI43_3)(a1)
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: ctlz_zero_undef_nxv8i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 2
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 8
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 16
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: li a0, 32
+; RV64I-NEXT: vsrl.vx v16, v8, a0
+; RV64I-NEXT: vor.vv v8, v8, v16
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI43_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI43_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI43_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI43_1)(a1)
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vsub.vv v8, v8, v16
+; RV64I-NEXT: vand.vx v16, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v16, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI43_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI43_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI43_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI43_3)(a1)
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v16, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v16, v8
+; CHECK-F-NEXT: li a1, 190
+; CHECK-F-NEXT: vrsub.vx v8, v16, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv8i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v8, v8, a1
+; CHECK-D-NEXT: li a1, 1086
+; CHECK-D-NEXT: vrsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 8 x i64> @llvm.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 true)
ret <vscale x 8 x i64> %a
}
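The cttz tests that follow use the same exponent trick on the isolated lowest set bit: vrsub.vi/vand.vv compute x & -x, a power of two that converts to f32 exactly, so the trailing-zero count is simply the biased exponent minus 127, and zero lanes are merged to the element width afterwards. A minimal scalar sketch for the i32 case, not part of the patch:

// Scalar model of the CHECK-F cttz lowering for i32 (sketch only, not part of
// the patch). x & -x is a power of two, so the conversion is exact and the
// rounding mode set by fsrmi cannot change the result here.
#include <cstdint>
#include <cstring>

static unsigned cttz32_via_f32(uint32_t x) {
  if (x == 0)
    return 32;                         // vmseq.vi + vmerge.vxm patch the zero lanes
  uint32_t lsb = x & (0u - x);         // vrsub.vi 0 + vand.vv: 1 << cttz(x)
  float f = static_cast<float>(lsb);   // vfcvt.f.xu.v, exact for a power of two
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return (bits >> 23) - 127u;          // vsrl.vi 23, then vsub.vx 127
}

int main() { return cttz32_via_f32(8) == 3 ? 0 : 1; }

The i8/i16 variants widen through vzext.vf2/vfwcvt.f.xu.v first and merge 8 or 16 for zero lanes; the i64 variants subtract the same 127 after narrowing to f32 (or 1023 for the f64 path).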
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index fd02061ffbc15..439e63aa68243 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV32,RV32I
; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV64,RV64I
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV32,RV32F
+; RUN: llc -mtriple=riscv64 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV64,RV64F
+; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV32,RV32D
+; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV64,RV64D
define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
; CHECK-ZVE64X-LABEL: cttz_nxv1i8:
@@ -26,6 +28,23 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv1i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v9, v8, v9
+; CHECK-F-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v10
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vsub.vx v8, v9, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv1i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
@@ -69,6 +88,23 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv2i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v9, v8, v9
+; CHECK-F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v10
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vsub.vx v8, v9, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv2i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
@@ -112,6 +148,23 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv4i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v9, v8, v9
+; CHECK-F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT: vnsrl.wi v9, v12, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vsub.vx v8, v9, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv4i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
@@ -155,6 +208,23 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv8i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v9, v8, v9
+; CHECK-F-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT: vnsrl.wi v10, v12, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v9, v10, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vsub.vx v8, v9, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv8i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, m1, ta, ma
@@ -198,6 +268,23 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv16i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v10, v8, 0
+; CHECK-F-NEXT: vand.vv v10, v8, v10
+; CHECK-F-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v12, v10
+; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT: vnsrl.wi v12, v16, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v10, v12, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: vsub.vx v8, v10, a0
+; CHECK-F-NEXT: vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv16i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, m2, ta, ma
@@ -328,6 +415,20 @@ define <vscale x 1 x i16> @cttz_nxv1i16(<vscale x 1 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv1i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v9, v8, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT: vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v9, v9, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv1i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
@@ -403,6 +504,20 @@ define <vscale x 2 x i16> @cttz_nxv2i16(<vscale x 2 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv2i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v9, v8, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT: vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v9, v9, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv2i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
@@ -478,6 +593,20 @@ define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv4i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v9, v8, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT: vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v9, v9, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv4i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma
@@ -553,6 +682,20 @@ define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv8i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v10, v8, 0
+; CHECK-F-NEXT: vand.vv v10, v8, v10
+; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT: vnsrl.wi v10, v12, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v10, v10, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v10, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv8i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma
@@ -628,6 +771,20 @@ define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv16i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT: vrsub.vi v12, v8, 0
+; CHECK-F-NEXT: vand.vv v12, v8, v12
+; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT: vnsrl.wi v12, v16, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v12, v12, a0
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a0, 16
+; CHECK-F-NEXT: vmerge.vxm v8, v12, a0, v0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv16i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma
@@ -766,6 +923,23 @@ define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv1i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v9, v8, v9
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; CHECK-F-NEXT: vsrl.vi v9, v9, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v9, v9, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv1i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
@@ -847,6 +1021,23 @@ define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv2i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v9, v8, v9
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; CHECK-F-NEXT: vsrl.vi v9, v9, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v9, v9, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv2i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma
@@ -928,6 +1119,23 @@ define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv4i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v10, v8, 0
+; CHECK-F-NEXT: vand.vv v10, v8, v10
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; CHECK-F-NEXT: vsrl.vi v10, v10, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v10, v10, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv4i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma
@@ -1009,6 +1217,23 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_nxv8i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: vrsub.vi v12, v8, 0
+; CHECK-F-NEXT: vand.vv v12, v8, v12
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v12, v12, v0.t
+; CHECK-F-NEXT: vsrl.vi v12, v12, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v12, v12, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_nxv8i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma
@@ -1032,387 +1257,733 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
declare <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32>, i1)
define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: cttz_nxv16i32:
-; RV32: # %bb.0:
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v16, v16, a0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v16, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_nxv16i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32I-NEXT: vsub.vx v16, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v16, v16, a0
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v16, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v16
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_nxv16i32:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_nxv16i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV64I-NEXT: vsub.vx v16, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vsub.vv v8, v8, v16
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v16, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v16, v8
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: cttz_nxv16i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-F-NEXT: vrsub.vi v16, v8, 0
+; CHECK-F-NEXT: vand.vv v16, v8, v16
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v16, v16, v0.t
+; CHECK-F-NEXT: vsrl.vi v16, v16, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v16, v16, a1
+; CHECK-F-NEXT: vmseq.vi v0, v8, 0
+; CHECK-F-NEXT: li a1, 32
+; CHECK-F-NEXT: vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: cttz_nxv16i32:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-D-NEXT: vrsub.vi v16, v8, 0
+; CHECK-D-NEXT: vand.vv v16, v8, v16
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v16, v16, v0.t
+; CHECK-D-NEXT: vsrl.vi v16, v16, 23
+; CHECK-D-NEXT: li a1, 127
+; CHECK-D-NEXT: vsub.vx v16, v16, a1
+; CHECK-D-NEXT: vmseq.vi v0, v8, 0
+; CHECK-D-NEXT: li a1, 32
+; CHECK-D-NEXT: vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 false)
ret <vscale x 16 x i32> %a
}
declare <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32>, i1)
define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: cttz_nxv1i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 1
-; RV32-NEXT: vand.vv v10, v11, v10
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: vand.vv v10, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v11
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vmul.vv v8, v8, v10
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_nxv1i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: vsrl.vi v11, v8, 1
+; RV32I-NEXT: vand.vv v10, v11, v10
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vsrl.vi v11, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v11
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vmul.vv v8, v8, v10
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_nxv1i64:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: lui a0, %hi(.LCPI18_0)
-; RV64-NEXT: ld a0, %lo(.LCPI18_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI18_1)
-; RV64-NEXT: ld a1, %lo(.LCPI18_1)(a1)
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: vand.vx v9, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: lui a0, %hi(.LCPI18_2)
-; RV64-NEXT: ld a0, %lo(.LCPI18_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI18_3)
-; RV64-NEXT: ld a1, %lo(.LCPI18_3)(a1)
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_nxv1i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: lui a0, %hi(.LCPI18_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI18_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI18_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI18_1)(a1)
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: vand.vx v9, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI18_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI18_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI18_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI18_3)(a1)
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; RV32F-LABEL: cttz_nxv1i64:
+; RV32F: # %bb.0:
+; RV32F-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32F-NEXT: vmseq.vx v9, v8, zero
+; RV32F-NEXT: vrsub.vi v10, v8, 0
+; RV32F-NEXT: vand.vv v8, v8, v10
+; RV32F-NEXT: vmset.m v0
+; RV32F-NEXT: fsrmi a0, 1
+; RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV32F-NEXT: vfncvt.f.xu.w v10, v8, v0.t
+; RV32F-NEXT: vsrl.vi v8, v10, 23
+; RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; RV32F-NEXT: vzext.vf2 v10, v8
+; RV32F-NEXT: li a1, 127
+; RV32F-NEXT: vsub.vx v8, v10, a1
+; RV32F-NEXT: li a1, 64
+; RV32F-NEXT: vmv.v.v v0, v9
+; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32F-NEXT: fsrm a0
+; RV32F-NEXT: ret
+;
+; RV64F-LABEL: cttz_nxv1i64:
+; RV64F: # %bb.0:
+; RV64F-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64F-NEXT: vrsub.vi v9, v8, 0
+; RV64F-NEXT: vand.vv v9, v8, v9
+; RV64F-NEXT: vmset.m v0
+; RV64F-NEXT: fsrmi a0, 1
+; RV64F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV64F-NEXT: vfncvt.f.xu.w v10, v9, v0.t
+; RV64F-NEXT: vsrl.vi v9, v10, 23
+; RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; RV64F-NEXT: vzext.vf2 v10, v9
+; RV64F-NEXT: li a1, 127
+; RV64F-NEXT: vsub.vx v9, v10, a1
+; RV64F-NEXT: vmseq.vi v0, v8, 0
+; RV64F-NEXT: li a1, 64
+; RV64F-NEXT: vmerge.vxm v8, v9, a1, v0
+; RV64F-NEXT: fsrm a0
+; RV64F-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv1i64:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32D-NEXT: vmseq.vx v9, v8, zero
+; RV32D-NEXT: vrsub.vi v10, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v10
+; RV32D-NEXT: vmset.m v0
+; RV32D-NEXT: fsrmi a0, 1
+; RV32D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; RV32D-NEXT: li a1, 52
+; RV32D-NEXT: vsrl.vx v8, v8, a1
+; RV32D-NEXT: li a1, 1023
+; RV32D-NEXT: vsub.vx v8, v8, a1
+; RV32D-NEXT: li a1, 64
+; RV32D-NEXT: vmv.v.v v0, v9
+; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32D-NEXT: fsrm a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv1i64:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64D-NEXT: vrsub.vi v9, v8, 0
+; RV64D-NEXT: vand.vv v9, v8, v9
+; RV64D-NEXT: vmset.m v0
+; RV64D-NEXT: fsrmi a0, 1
+; RV64D-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; RV64D-NEXT: li a1, 52
+; RV64D-NEXT: vsrl.vx v9, v9, a1
+; RV64D-NEXT: li a1, 1023
+; RV64D-NEXT: vsub.vx v9, v9, a1
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: li a1, 64
+; RV64D-NEXT: vmerge.vxm v8, v9, a1, v0
+; RV64D-NEXT: fsrm a0
+; RV64D-NEXT: ret
%a = call <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64> %va, i1 false)
ret <vscale x 1 x i64> %a
}
declare <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64>, i1)
define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: cttz_nxv2i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT: vsub.vx v10, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 1
-; RV32-NEXT: vand.vv v12, v14, v12
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: vand.vv v12, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v14
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vmul.vv v8, v8, v12
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_nxv2i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT: vsub.vx v10, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vsrl.vi v14, v8, 1
+; RV32I-NEXT: vand.vv v12, v14, v12
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vsrl.vi v14, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v14
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vmul.vv v8, v8, v12
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_nxv2i64:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV64-NEXT: vsub.vx v10, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v10
-; RV64-NEXT: lui a0, %hi(.LCPI19_0)
-; RV64-NEXT: ld a0, %lo(.LCPI19_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI19_1)
-; RV64-NEXT: ld a1, %lo(.LCPI19_1)(a1)
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: vand.vx v10, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: lui a0, %hi(.LCPI19_2)
-; RV64-NEXT: ld a0, %lo(.LCPI19_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI19_3)
-; RV64-NEXT: ld a1, %lo(.LCPI19_3)(a1)
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_nxv2i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT: vsub.vx v10, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: lui a0, %hi(.LCPI19_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI19_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI19_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI19_1)(a1)
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: vand.vx v10, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI19_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI19_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI19_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI19_3)(a1)
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; RV32F-LABEL: cttz_nxv2i64:
+; RV32F: # %bb.0:
+; RV32F-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32F-NEXT: vmseq.vx v10, v8, zero
+; RV32F-NEXT: vrsub.vi v12, v8, 0
+; RV32F-NEXT: vand.vv v8, v8, v12
+; RV32F-NEXT: vmset.m v0
+; RV32F-NEXT: fsrmi a0, 1
+; RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV32F-NEXT: vfncvt.f.xu.w v11, v8, v0.t
+; RV32F-NEXT: vsrl.vi v8, v11, 23
+; RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV32F-NEXT: vzext.vf2 v12, v8
+; RV32F-NEXT: li a1, 127
+; RV32F-NEXT: vsub.vx v8, v12, a1
+; RV32F-NEXT: li a1, 64
+; RV32F-NEXT: vmv1r.v v0, v10
+; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32F-NEXT: fsrm a0
+; RV32F-NEXT: ret
+;
+; RV64F-LABEL: cttz_nxv2i64:
+; RV64F: # %bb.0:
+; RV64F-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV64F-NEXT: vrsub.vi v10, v8, 0
+; RV64F-NEXT: vand.vv v10, v8, v10
+; RV64F-NEXT: vmset.m v0
+; RV64F-NEXT: fsrmi a0, 1
+; RV64F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64F-NEXT: vfncvt.f.xu.w v12, v10, v0.t
+; RV64F-NEXT: vsrl.vi v10, v12, 23
+; RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64F-NEXT: vzext.vf2 v12, v10
+; RV64F-NEXT: li a1, 127
+; RV64F-NEXT: vsub.vx v10, v12, a1
+; RV64F-NEXT: vmseq.vi v0, v8, 0
+; RV64F-NEXT: li a1, 64
+; RV64F-NEXT: vmerge.vxm v8, v10, a1, v0
+; RV64F-NEXT: fsrm a0
+; RV64F-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv2i64:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV32D-NEXT: vmseq.vx v10, v8, zero
+; RV32D-NEXT: vrsub.vi v12, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v12
+; RV32D-NEXT: vmset.m v0
+; RV32D-NEXT: fsrmi a0, 1
+; RV32D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; RV32D-NEXT: li a1, 52
+; RV32D-NEXT: vsrl.vx v8, v8, a1
+; RV32D-NEXT: li a1, 1023
+; RV32D-NEXT: vsub.vx v8, v8, a1
+; RV32D-NEXT: li a1, 64
+; RV32D-NEXT: vmv1r.v v0, v10
+; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32D-NEXT: fsrm a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv2i64:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV64D-NEXT: vrsub.vi v10, v8, 0
+; RV64D-NEXT: vand.vv v10, v8, v10
+; RV64D-NEXT: vmset.m v0
+; RV64D-NEXT: fsrmi a0, 1
+; RV64D-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; RV64D-NEXT: li a1, 52
+; RV64D-NEXT: vsrl.vx v10, v10, a1
+; RV64D-NEXT: li a1, 1023
+; RV64D-NEXT: vsub.vx v10, v10, a1
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: li a1, 64
+; RV64D-NEXT: vmerge.vxm v8, v10, a1, v0
+; RV64D-NEXT: fsrm a0
+; RV64D-NEXT: ret
%a = call <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64> %va, i1 false)
ret <vscale x 2 x i64> %a
}
declare <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64>, i1)
define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
-; RV32-LABEL: cttz_nxv4i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT: vsub.vx v12, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 1
-; RV32-NEXT: vand.vv v16, v20, v16
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: vand.vv v16, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v20
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_nxv4i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT: vsub.vx v12, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vsrl.vi v20, v8, 1
+; RV32I-NEXT: vand.vv v16, v20, v16
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: vand.vv v16, v8, v12
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vsrl.vi v20, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v20
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vmul.vv v8, v8, v16
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_nxv4i64:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV64-NEXT: vsub.vx v12, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v12
-; RV64-NEXT: lui a0, %hi(.LCPI20_0)
-; RV64-NEXT: ld a0, %lo(.LCPI20_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI20_1)
-; RV64-NEXT: ld a1, %lo(.LCPI20_1)(a1)
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: vand.vx v12, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: lui a0, %hi(.LCPI20_2)
-; RV64-NEXT: ld a0, %lo(.LCPI20_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI20_3)
-; RV64-NEXT: ld a1, %lo(.LCPI20_3)(a1)
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_nxv4i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT: vsub.vx v12, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: lui a0, %hi(.LCPI20_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI20_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI20_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI20_1)(a1)
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: vand.vx v12, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI20_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI20_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI20_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI20_3)(a1)
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; RV32F-LABEL: cttz_nxv4i64:
+; RV32F: # %bb.0:
+; RV32F-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32F-NEXT: vmseq.vx v12, v8, zero
+; RV32F-NEXT: vrsub.vi v16, v8, 0
+; RV32F-NEXT: vand.vv v8, v8, v16
+; RV32F-NEXT: vmset.m v0
+; RV32F-NEXT: fsrmi a0, 1
+; RV32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV32F-NEXT: vfncvt.f.xu.w v14, v8, v0.t
+; RV32F-NEXT: vsrl.vi v8, v14, 23
+; RV32F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV32F-NEXT: vzext.vf2 v16, v8
+; RV32F-NEXT: li a1, 127
+; RV32F-NEXT: vsub.vx v8, v16, a1
+; RV32F-NEXT: li a1, 64
+; RV32F-NEXT: vmv1r.v v0, v12
+; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32F-NEXT: fsrm a0
+; RV32F-NEXT: ret
+;
+; RV64F-LABEL: cttz_nxv4i64:
+; RV64F: # %bb.0:
+; RV64F-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV64F-NEXT: vrsub.vi v12, v8, 0
+; RV64F-NEXT: vand.vv v12, v8, v12
+; RV64F-NEXT: vmset.m v0
+; RV64F-NEXT: fsrmi a0, 1
+; RV64F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; RV64F-NEXT: vfncvt.f.xu.w v16, v12, v0.t
+; RV64F-NEXT: vsrl.vi v12, v16, 23
+; RV64F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; RV64F-NEXT: vzext.vf2 v16, v12
+; RV64F-NEXT: li a1, 127
+; RV64F-NEXT: vsub.vx v12, v16, a1
+; RV64F-NEXT: vmseq.vi v0, v8, 0
+; RV64F-NEXT: li a1, 64
+; RV64F-NEXT: vmerge.vxm v8, v12, a1, v0
+; RV64F-NEXT: fsrm a0
+; RV64F-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv4i64:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32D-NEXT: vmseq.vx v12, v8, zero
+; RV32D-NEXT: vrsub.vi v16, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v16
+; RV32D-NEXT: vmset.m v0
+; RV32D-NEXT: fsrmi a0, 1
+; RV32D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; RV32D-NEXT: li a1, 52
+; RV32D-NEXT: vsrl.vx v8, v8, a1
+; RV32D-NEXT: li a1, 1023
+; RV32D-NEXT: vsub.vx v8, v8, a1
+; RV32D-NEXT: li a1, 64
+; RV32D-NEXT: vmv1r.v v0, v12
+; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32D-NEXT: fsrm a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv4i64:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV64D-NEXT: vrsub.vi v12, v8, 0
+; RV64D-NEXT: vand.vv v12, v8, v12
+; RV64D-NEXT: vmset.m v0
+; RV64D-NEXT: fsrmi a0, 1
+; RV64D-NEXT: vfcvt.f.xu.v v12, v12, v0.t
+; RV64D-NEXT: li a1, 52
+; RV64D-NEXT: vsrl.vx v12, v12, a1
+; RV64D-NEXT: li a1, 1023
+; RV64D-NEXT: vsub.vx v12, v12, a1
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: li a1, 64
+; RV64D-NEXT: vmerge.vxm v8, v12, a1, v0
+; RV64D-NEXT: fsrm a0
+; RV64D-NEXT: ret
%a = call <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64> %va, i1 false)
ret <vscale x 4 x i64> %a
}
declare <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64>, i1)
define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
-; RV32-LABEL: cttz_nxv8i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: vand.vv v24, v0, v24
-; RV32-NEXT: vsub.vv v8, v8, v24
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v0
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vmul.vv v8, v8, v24
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_nxv8i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT: vsub.vx v16, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v24, (a0), zero
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vsrl.vi v0, v8, 1
+; RV32I-NEXT: vand.vv v24, v0, v24
+; RV32I-NEXT: vsub.vv v8, v8, v24
+; RV32I-NEXT: vand.vv v24, v8, v16
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vadd.vv v8, v24, v8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vlse64.v v24, (a0), zero
+; RV32I-NEXT: vsrl.vi v0, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v0
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vmul.vv v8, v8, v24
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_nxv8i64:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: lui a0, %hi(.LCPI21_0)
-; RV64-NEXT: ld a0, %lo(.LCPI21_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI21_1)
-; RV64-NEXT: ld a1, %lo(.LCPI21_1)(a1)
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: vand.vx v16, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: lui a0, %hi(.LCPI21_2)
-; RV64-NEXT: ld a0, %lo(.LCPI21_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI21_3)
-; RV64-NEXT: ld a1, %lo(.LCPI21_3)(a1)
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_nxv8i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV64I-NEXT: vsub.vx v16, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: lui a0, %hi(.LCPI21_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI21_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI21_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI21_1)(a1)
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vsub.vv v8, v8, v16
+; RV64I-NEXT: vand.vx v16, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v16, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI21_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI21_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI21_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI21_3)(a1)
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; RV32F-LABEL: cttz_nxv8i64:
+; RV32F: # %bb.0:
+; RV32F-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32F-NEXT: vmseq.vx v16, v8, zero
+; RV32F-NEXT: vrsub.vi v24, v8, 0
+; RV32F-NEXT: vand.vv v8, v8, v24
+; RV32F-NEXT: vmset.m v0
+; RV32F-NEXT: fsrmi a0, 1
+; RV32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV32F-NEXT: vfncvt.f.xu.w v20, v8, v0.t
+; RV32F-NEXT: vsrl.vi v8, v20, 23
+; RV32F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV32F-NEXT: vzext.vf2 v24, v8
+; RV32F-NEXT: li a1, 127
+; RV32F-NEXT: vsub.vx v8, v24, a1
+; RV32F-NEXT: li a1, 64
+; RV32F-NEXT: vmv1r.v v0, v16
+; RV32F-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32F-NEXT: fsrm a0
+; RV32F-NEXT: ret
+;
+; RV64F-LABEL: cttz_nxv8i64:
+; RV64F: # %bb.0:
+; RV64F-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV64F-NEXT: vrsub.vi v16, v8, 0
+; RV64F-NEXT: vand.vv v16, v8, v16
+; RV64F-NEXT: vmset.m v0
+; RV64F-NEXT: fsrmi a0, 1
+; RV64F-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; RV64F-NEXT: vfncvt.f.xu.w v24, v16, v0.t
+; RV64F-NEXT: vsrl.vi v16, v24, 23
+; RV64F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; RV64F-NEXT: vzext.vf2 v24, v16
+; RV64F-NEXT: li a1, 127
+; RV64F-NEXT: vsub.vx v16, v24, a1
+; RV64F-NEXT: vmseq.vi v0, v8, 0
+; RV64F-NEXT: li a1, 64
+; RV64F-NEXT: vmerge.vxm v8, v16, a1, v0
+; RV64F-NEXT: fsrm a0
+; RV64F-NEXT: ret
+;
+; RV32D-LABEL: cttz_nxv8i64:
+; RV32D: # %bb.0:
+; RV32D-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV32D-NEXT: vmseq.vx v16, v8, zero
+; RV32D-NEXT: vrsub.vi v24, v8, 0
+; RV32D-NEXT: vand.vv v8, v8, v24
+; RV32D-NEXT: vmset.m v0
+; RV32D-NEXT: fsrmi a0, 1
+; RV32D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; RV32D-NEXT: li a1, 52
+; RV32D-NEXT: vsrl.vx v8, v8, a1
+; RV32D-NEXT: li a1, 1023
+; RV32D-NEXT: vsub.vx v8, v8, a1
+; RV32D-NEXT: li a1, 64
+; RV32D-NEXT: vmv1r.v v0, v16
+; RV32D-NEXT: vmerge.vxm v8, v8, a1, v0
+; RV32D-NEXT: fsrm a0
+; RV32D-NEXT: ret
+;
+; RV64D-LABEL: cttz_nxv8i64:
+; RV64D: # %bb.0:
+; RV64D-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV64D-NEXT: vrsub.vi v16, v8, 0
+; RV64D-NEXT: vand.vv v16, v8, v16
+; RV64D-NEXT: vmset.m v0
+; RV64D-NEXT: fsrmi a0, 1
+; RV64D-NEXT: vfcvt.f.xu.v v16, v16, v0.t
+; RV64D-NEXT: li a1, 52
+; RV64D-NEXT: vsrl.vx v16, v16, a1
+; RV64D-NEXT: li a1, 1023
+; RV64D-NEXT: vsub.vx v16, v16, a1
+; RV64D-NEXT: vmseq.vi v0, v8, 0
+; RV64D-NEXT: li a1, 64
+; RV64D-NEXT: vmerge.vxm v8, v16, a1, v0
+; RV64D-NEXT: fsrm a0
+; RV64D-NEXT: ret
%a = call <vscale x 8 x i64> @llvm.cttz.nxv8i64(<vscale x 8 x i64> %va, i1 false)
ret <vscale x 8 x i64> %a
}
@@ -1440,6 +2011,21 @@ define <vscale x 1 x i8> @cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv1i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv1i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
@@ -1480,6 +2066,21 @@ define <vscale x 2 x i8> @cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv2i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv2i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
@@ -1520,6 +2121,21 @@ define <vscale x 4 x i8> @cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv4i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT: vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv4i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
@@ -1560,6 +2176,21 @@ define <vscale x 8 x i8> @cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv8i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT: vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v10, v8, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v10, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv8i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, m1, ta, ma
@@ -1600,6 +2231,21 @@ define <vscale x 16 x i8> @cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
; CHECK-ZVE64X-NEXT: vand.vi v8, v8, 15
; CHECK-ZVE64X-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv16i8:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v10, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v10
+; CHECK-F-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v12, v8
+; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT: vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-F-NEXT: vnsrl.wi v12, v8, 0
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v12, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv16i8:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e8, m2, ta, ma
@@ -1725,6 +2371,17 @@ define <vscale x 1 x i16> @cttz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv1i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v9, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv1i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
@@ -1796,6 +2453,17 @@ define <vscale x 2 x i16> @cttz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv2i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v9, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv2i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
@@ -1867,6 +2535,17 @@ define <vscale x 4 x i16> @cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv4i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vfwcvt.f.xu.v v10, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv4i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m1, ta, ma
@@ -1938,6 +2617,17 @@ define <vscale x 8 x i16> @cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv8i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v10, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v10
+; CHECK-F-NEXT: vfwcvt.f.xu.v v12, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv8i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m2, ta, ma
@@ -2009,6 +2699,17 @@ define <vscale x 16 x i16> @cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 8
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv16i16:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT: vrsub.vi v12, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v12
+; CHECK-F-NEXT: vfwcvt.f.xu.v v16, v8
+; CHECK-F-NEXT: vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT: li a0, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv16i16:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e16, m4, ta, ma
@@ -2142,6 +2843,20 @@ define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv1i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv1i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
@@ -2219,6 +2934,20 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv2i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv2i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m1, ta, ma
@@ -2296,6 +3025,20 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv4i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v10, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v10
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv4i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m2, ta, ma
@@ -2373,6 +3116,20 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
; RV64I-NEXT: vsrl.vi v8, v8, 24
; RV64I-NEXT: ret
;
+; CHECK-F-LABEL: cttz_zero_undef_nxv8i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: vrsub.vi v12, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v12
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
; CHECK-D-LABEL: cttz_zero_undef_nxv8i32:
; CHECK-D: # %bb.0:
; CHECK-D-NEXT: vsetvli a0, zero, e32, m4, ta, ma
@@ -2392,383 +3149,539 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
}
define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv16i32:
-; RV32: # %bb.0:
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vsrl.vi v16, v8, 1
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: vand.vx v16, v16, a0
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: vand.vx v16, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vsrl.vi v16, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v16
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: vand.vx v8, v8, a0
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: vmul.vx v8, v8, a0
-; RV32-NEXT: vsrl.vi v8, v8, 24
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv16i32:
+; RV32I: # %bb.0:
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32I-NEXT: vsub.vx v16, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vsrl.vi v16, v8, 1
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: vand.vx v16, v16, a0
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: vand.vx v16, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vsrl.vi v16, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v16
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: vand.vx v8, v8, a0
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: vmul.vx v8, v8, a0
+; RV32I-NEXT: vsrl.vi v8, v8, 24
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv16i32:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: lui a0, 349525
-; RV64-NEXT: addiw a0, a0, 1365
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: lui a0, 209715
-; RV64-NEXT: addiw a0, a0, 819
-; RV64-NEXT: vand.vx v16, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: lui a0, 61681
-; RV64-NEXT: addiw a0, a0, -241
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: lui a0, 4112
-; RV64-NEXT: addiw a0, a0, 257
-; RV64-NEXT: vmul.vx v8, v8, a0
-; RV64-NEXT: vsrl.vi v8, v8, 24
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv16i32:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV64I-NEXT: vsub.vx v16, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: lui a0, 349525
+; RV64I-NEXT: addiw a0, a0, 1365
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vsub.vv v8, v8, v16
+; RV64I-NEXT: lui a0, 209715
+; RV64I-NEXT: addiw a0, a0, 819
+; RV64I-NEXT: vand.vx v16, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vadd.vv v8, v16, v8
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: lui a0, 61681
+; RV64I-NEXT: addiw a0, a0, -241
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: lui a0, 4112
+; RV64I-NEXT: addiw a0, a0, 257
+; RV64I-NEXT: vmul.vx v8, v8, a0
+; RV64I-NEXT: vsrl.vi v8, v8, 24
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv16i32:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-F-NEXT: vrsub.vi v16, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v16
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v8, 23
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v8, v8, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv16i32:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-D-NEXT: vrsub.vi v16, v8, 0
+; CHECK-D-NEXT: vand.vv v8, v8, v16
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: vsrl.vi v8, v8, 23
+; CHECK-D-NEXT: li a1, 127
+; CHECK-D-NEXT: vsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 true)
ret <vscale x 16 x i32> %a
}
define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv1i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT: vsub.vx v9, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 1
-; RV32-NEXT: vand.vv v10, v11, v10
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: vand.vv v10, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vadd.vv v8, v10, v8
-; RV32-NEXT: vlse64.v v9, (a0), zero
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v11, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v11
-; RV32-NEXT: vand.vv v8, v8, v9
-; RV32-NEXT: vmul.vv v8, v8, v10
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv1i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT: vsub.vx v9, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: vsrl.vi v11, v8, 1
+; RV32I-NEXT: vand.vv v10, v11, v10
+; RV32I-NEXT: vsub.vv v8, v8, v10
+; RV32I-NEXT: vand.vv v10, v8, v9
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vadd.vv v8, v10, v8
+; RV32I-NEXT: vlse64.v v9, (a0), zero
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vsrl.vi v11, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v11
+; RV32I-NEXT: vand.vv v8, v8, v9
+; RV32I-NEXT: vmul.vv v8, v8, v10
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv1i64:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; RV64-NEXT: vsub.vx v9, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v9
-; RV64-NEXT: lui a0, %hi(.LCPI40_0)
-; RV64-NEXT: ld a0, %lo(.LCPI40_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI40_1)
-; RV64-NEXT: ld a1, %lo(.LCPI40_1)(a1)
-; RV64-NEXT: vsrl.vi v9, v8, 1
-; RV64-NEXT: vand.vx v9, v9, a0
-; RV64-NEXT: vsub.vv v8, v8, v9
-; RV64-NEXT: vand.vx v9, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v9, v8
-; RV64-NEXT: lui a0, %hi(.LCPI40_2)
-; RV64-NEXT: ld a0, %lo(.LCPI40_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI40_3)
-; RV64-NEXT: ld a1, %lo(.LCPI40_3)(a1)
-; RV64-NEXT: vsrl.vi v9, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v9
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv1i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT: vsub.vx v9, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v9
+; RV64I-NEXT: lui a0, %hi(.LCPI40_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI40_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI40_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI40_1)(a1)
+; RV64I-NEXT: vsrl.vi v9, v8, 1
+; RV64I-NEXT: vand.vx v9, v9, a0
+; RV64I-NEXT: vsub.vv v8, v8, v9
+; RV64I-NEXT: vand.vx v9, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v9, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI40_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI40_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI40_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI40_3)(a1)
+; RV64I-NEXT: vsrl.vi v9, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v9
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv1i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vrsub.vi v9, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v9
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT: vfncvt.f.xu.w v9, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v9, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v9, v8
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v8, v9, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv1i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-D-NEXT: vrsub.vi v9, v8, 0
+; CHECK-D-NEXT: vand.vv v8, v8, v9
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v8, v8, a1
+; CHECK-D-NEXT: li a1, 1023
+; CHECK-D-NEXT: vsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64> %va, i1 true)
ret <vscale x 1 x i64> %a
}
define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv2i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT: vsub.vx v10, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 1
-; RV32-NEXT: vand.vv v12, v14, v12
-; RV32-NEXT: vsub.vv v8, v8, v12
-; RV32-NEXT: vand.vv v12, v8, v10
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vadd.vv v8, v12, v8
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v14, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v14
-; RV32-NEXT: vand.vv v8, v8, v10
-; RV32-NEXT: vmul.vv v8, v8, v12
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv2i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT: vsub.vx v10, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vsrl.vi v14, v8, 1
+; RV32I-NEXT: vand.vv v12, v14, v12
+; RV32I-NEXT: vsub.vv v8, v8, v12
+; RV32I-NEXT: vand.vv v12, v8, v10
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vadd.vv v8, v12, v8
+; RV32I-NEXT: vlse64.v v10, (a0), zero
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vsrl.vi v14, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v14
+; RV32I-NEXT: vand.vv v8, v8, v10
+; RV32I-NEXT: vmul.vv v8, v8, v12
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv2i64:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma
-; RV64-NEXT: vsub.vx v10, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v10
-; RV64-NEXT: lui a0, %hi(.LCPI41_0)
-; RV64-NEXT: ld a0, %lo(.LCPI41_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI41_1)
-; RV64-NEXT: ld a1, %lo(.LCPI41_1)(a1)
-; RV64-NEXT: vsrl.vi v10, v8, 1
-; RV64-NEXT: vand.vx v10, v10, a0
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: vand.vx v10, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v10, v8
-; RV64-NEXT: lui a0, %hi(.LCPI41_2)
-; RV64-NEXT: ld a0, %lo(.LCPI41_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI41_3)
-; RV64-NEXT: ld a1, %lo(.LCPI41_3)(a1)
-; RV64-NEXT: vsrl.vi v10, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv2i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT: vsub.vx v10, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v10
+; RV64I-NEXT: lui a0, %hi(.LCPI41_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI41_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI41_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI41_1)(a1)
+; RV64I-NEXT: vsrl.vi v10, v8, 1
+; RV64I-NEXT: vand.vx v10, v10, a0
+; RV64I-NEXT: vsub.vv v8, v8, v10
+; RV64I-NEXT: vand.vx v10, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v10, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI41_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI41_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI41_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI41_3)(a1)
+; RV64I-NEXT: vsrl.vi v10, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v10
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv2i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vrsub.vi v10, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v10
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-F-NEXT: vfncvt.f.xu.w v10, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v10, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v10, v8
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v8, v10, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv2i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-D-NEXT: vrsub.vi v10, v8, 0
+; CHECK-D-NEXT: vand.vv v8, v8, v10
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v8, v8, a1
+; CHECK-D-NEXT: li a1, 1023
+; CHECK-D-NEXT: vsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64> %va, i1 true)
ret <vscale x 2 x i64> %a
}
define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv4i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT: vsub.vx v12, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 1
-; RV32-NEXT: vand.vv v16, v20, v16
-; RV32-NEXT: vsub.vv v8, v8, v16
-; RV32-NEXT: vand.vv v16, v8, v12
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vadd.vv v8, v16, v8
-; RV32-NEXT: vlse64.v v12, (a0), zero
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v20, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v20
-; RV32-NEXT: vand.vv v8, v8, v12
-; RV32-NEXT: vmul.vv v8, v8, v16
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv4i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT: vsub.vx v12, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vsrl.vi v20, v8, 1
+; RV32I-NEXT: vand.vv v16, v20, v16
+; RV32I-NEXT: vsub.vv v8, v8, v16
+; RV32I-NEXT: vand.vv v16, v8, v12
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vadd.vv v8, v16, v8
+; RV32I-NEXT: vlse64.v v12, (a0), zero
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vsrl.vi v20, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v20
+; RV32I-NEXT: vand.vv v8, v8, v12
+; RV32I-NEXT: vmul.vv v8, v8, v16
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv4i64:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma
-; RV64-NEXT: vsub.vx v12, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v12
-; RV64-NEXT: lui a0, %hi(.LCPI42_0)
-; RV64-NEXT: ld a0, %lo(.LCPI42_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI42_1)
-; RV64-NEXT: ld a1, %lo(.LCPI42_1)(a1)
-; RV64-NEXT: vsrl.vi v12, v8, 1
-; RV64-NEXT: vand.vx v12, v12, a0
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: vand.vx v12, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v12, v8
-; RV64-NEXT: lui a0, %hi(.LCPI42_2)
-; RV64-NEXT: ld a0, %lo(.LCPI42_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI42_3)
-; RV64-NEXT: ld a1, %lo(.LCPI42_3)(a1)
-; RV64-NEXT: vsrl.vi v12, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v12
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv4i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT: vsub.vx v12, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v12
+; RV64I-NEXT: lui a0, %hi(.LCPI42_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI42_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI42_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI42_1)(a1)
+; RV64I-NEXT: vsrl.vi v12, v8, 1
+; RV64I-NEXT: vand.vx v12, v12, a0
+; RV64I-NEXT: vsub.vv v8, v8, v12
+; RV64I-NEXT: vand.vx v12, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v12, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI42_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI42_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI42_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI42_3)(a1)
+; RV64I-NEXT: vsrl.vi v12, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v12
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv4i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vrsub.vi v12, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v12
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-F-NEXT: vfncvt.f.xu.w v12, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v12, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v12, v8
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v8, v12, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv4i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-D-NEXT: vrsub.vi v12, v8, 0
+; CHECK-D-NEXT: vand.vv v8, v8, v12
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v8, v8, a1
+; CHECK-D-NEXT: li a1, 1023
+; CHECK-D-NEXT: vsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64> %va, i1 true)
ret <vscale x 4 x i64> %a
}
define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv8i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: lui a0, 349525
-; RV32-NEXT: addi a0, a0, 1365
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 209715
-; RV32-NEXT: addi a0, a0, 819
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 61681
-; RV32-NEXT: addi a0, a0, -241
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: lui a0, 4112
-; RV32-NEXT: addi a0, a0, 257
-; RV32-NEXT: sw a0, 12(sp)
-; RV32-NEXT: sw a0, 8(sp)
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT: vsub.vx v16, v8, a0
-; RV32-NEXT: vnot.v v8, v8
-; RV32-NEXT: addi a0, sp, 8
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 1
-; RV32-NEXT: vand.vv v24, v0, v24
-; RV32-NEXT: vsub.vv v8, v8, v24
-; RV32-NEXT: vand.vv v24, v8, v16
-; RV32-NEXT: vsrl.vi v8, v8, 2
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vadd.vv v8, v24, v8
-; RV32-NEXT: vlse64.v v16, (a0), zero
-; RV32-NEXT: vlse64.v v24, (a0), zero
-; RV32-NEXT: vsrl.vi v0, v8, 4
-; RV32-NEXT: vadd.vv v8, v8, v0
-; RV32-NEXT: vand.vv v8, v8, v16
-; RV32-NEXT: vmul.vv v8, v8, v24
-; RV32-NEXT: li a0, 56
-; RV32-NEXT: vsrl.vx v8, v8, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
+; RV32I-LABEL: cttz_zero_undef_nxv8i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: lui a0, 349525
+; RV32I-NEXT: addi a0, a0, 1365
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 209715
+; RV32I-NEXT: addi a0, a0, 819
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 61681
+; RV32I-NEXT: addi a0, a0, -241
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: lui a0, 4112
+; RV32I-NEXT: addi a0, a0, 257
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a0, 8(sp)
+; RV32I-NEXT: li a0, 1
+; RV32I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT: vsub.vx v16, v8, a0
+; RV32I-NEXT: vnot.v v8, v8
+; RV32I-NEXT: addi a0, sp, 8
+; RV32I-NEXT: vlse64.v v24, (a0), zero
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vsrl.vi v0, v8, 1
+; RV32I-NEXT: vand.vv v24, v0, v24
+; RV32I-NEXT: vsub.vv v8, v8, v24
+; RV32I-NEXT: vand.vv v24, v8, v16
+; RV32I-NEXT: vsrl.vi v8, v8, 2
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vadd.vv v8, v24, v8
+; RV32I-NEXT: vlse64.v v16, (a0), zero
+; RV32I-NEXT: vlse64.v v24, (a0), zero
+; RV32I-NEXT: vsrl.vi v0, v8, 4
+; RV32I-NEXT: vadd.vv v8, v8, v0
+; RV32I-NEXT: vand.vv v8, v8, v16
+; RV32I-NEXT: vmul.vv v8, v8, v24
+; RV32I-NEXT: li a0, 56
+; RV32I-NEXT: vsrl.vx v8, v8, a0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
;
-; RV64-LABEL: cttz_zero_undef_nxv8i64:
-; RV64: # %bb.0:
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsub.vx v16, v8, a0
-; RV64-NEXT: vnot.v v8, v8
-; RV64-NEXT: vand.vv v8, v8, v16
-; RV64-NEXT: lui a0, %hi(.LCPI43_0)
-; RV64-NEXT: ld a0, %lo(.LCPI43_0)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI43_1)
-; RV64-NEXT: ld a1, %lo(.LCPI43_1)(a1)
-; RV64-NEXT: vsrl.vi v16, v8, 1
-; RV64-NEXT: vand.vx v16, v16, a0
-; RV64-NEXT: vsub.vv v8, v8, v16
-; RV64-NEXT: vand.vx v16, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 2
-; RV64-NEXT: vand.vx v8, v8, a1
-; RV64-NEXT: vadd.vv v8, v16, v8
-; RV64-NEXT: lui a0, %hi(.LCPI43_2)
-; RV64-NEXT: ld a0, %lo(.LCPI43_2)(a0)
-; RV64-NEXT: lui a1, %hi(.LCPI43_3)
-; RV64-NEXT: ld a1, %lo(.LCPI43_3)(a1)
-; RV64-NEXT: vsrl.vi v16, v8, 4
-; RV64-NEXT: vadd.vv v8, v8, v16
-; RV64-NEXT: vand.vx v8, v8, a0
-; RV64-NEXT: vmul.vx v8, v8, a1
-; RV64-NEXT: li a0, 56
-; RV64-NEXT: vsrl.vx v8, v8, a0
-; RV64-NEXT: ret
+; RV64I-LABEL: cttz_zero_undef_nxv8i64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a0, 1
+; RV64I-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV64I-NEXT: vsub.vx v16, v8, a0
+; RV64I-NEXT: vnot.v v8, v8
+; RV64I-NEXT: vand.vv v8, v8, v16
+; RV64I-NEXT: lui a0, %hi(.LCPI43_0)
+; RV64I-NEXT: ld a0, %lo(.LCPI43_0)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI43_1)
+; RV64I-NEXT: ld a1, %lo(.LCPI43_1)(a1)
+; RV64I-NEXT: vsrl.vi v16, v8, 1
+; RV64I-NEXT: vand.vx v16, v16, a0
+; RV64I-NEXT: vsub.vv v8, v8, v16
+; RV64I-NEXT: vand.vx v16, v8, a1
+; RV64I-NEXT: vsrl.vi v8, v8, 2
+; RV64I-NEXT: vand.vx v8, v8, a1
+; RV64I-NEXT: vadd.vv v8, v16, v8
+; RV64I-NEXT: lui a0, %hi(.LCPI43_2)
+; RV64I-NEXT: ld a0, %lo(.LCPI43_2)(a0)
+; RV64I-NEXT: lui a1, %hi(.LCPI43_3)
+; RV64I-NEXT: ld a1, %lo(.LCPI43_3)(a1)
+; RV64I-NEXT: vsrl.vi v16, v8, 4
+; RV64I-NEXT: vadd.vv v8, v8, v16
+; RV64I-NEXT: vand.vx v8, v8, a0
+; RV64I-NEXT: vmul.vx v8, v8, a1
+; RV64I-NEXT: li a0, 56
+; RV64I-NEXT: vsrl.vx v8, v8, a0
+; RV64I-NEXT: ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv8i64:
+; CHECK-F: # %bb.0:
+; CHECK-F-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vrsub.vi v16, v8, 0
+; CHECK-F-NEXT: vand.vv v8, v8, v16
+; CHECK-F-NEXT: vmset.m v0
+; CHECK-F-NEXT: fsrmi a0, 1
+; CHECK-F-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-F-NEXT: vfncvt.f.xu.w v16, v8, v0.t
+; CHECK-F-NEXT: vsrl.vi v8, v16, 23
+; CHECK-F-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT: vzext.vf2 v16, v8
+; CHECK-F-NEXT: li a1, 127
+; CHECK-F-NEXT: vsub.vx v8, v16, a1
+; CHECK-F-NEXT: fsrm a0
+; CHECK-F-NEXT: ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv8i64:
+; CHECK-D: # %bb.0:
+; CHECK-D-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-D-NEXT: vrsub.vi v16, v8, 0
+; CHECK-D-NEXT: vand.vv v8, v8, v16
+; CHECK-D-NEXT: vmset.m v0
+; CHECK-D-NEXT: fsrmi a0, 1
+; CHECK-D-NEXT: vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT: li a1, 52
+; CHECK-D-NEXT: vsrl.vx v8, v8, a1
+; CHECK-D-NEXT: li a1, 1023
+; CHECK-D-NEXT: vsub.vx v8, v8, a1
+; CHECK-D-NEXT: fsrm a0
+; CHECK-D-NEXT: ret
%a = call <vscale x 8 x i64> @llvm.cttz.nxv8i64(<vscale x 8 x i64> %va, i1 true)
ret <vscale x 8 x i64> %a
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index cc08aeb8cd2ba..abc68c40ad298 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64I
; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32F
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64F
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32D
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64D
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32
@@ -201,6 +203,34 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
; LMULMAX1-RV64-NEXT: vse16.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
;
+; LMULMAX2-RV32F-LABEL: ctlz_v8i16:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX2-RV32F-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX2-RV32F-NEXT: li a1, 142
+; LMULMAX2-RV32F-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT: li a1, 16
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
+;
+; LMULMAX2-RV64F-LABEL: ctlz_v8i16:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vfwcvt.f.xu.v v10, v8
+; LMULMAX2-RV64F-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX2-RV64F-NEXT: li a1, 142
+; LMULMAX2-RV64F-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 16
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64F-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
+;
; LMULMAX2-RV32D-LABEL: ctlz_v8i16:
; LMULMAX2-RV32D: # %bb.0:
; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, ma
@@ -327,81 +357,39 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0)
; LMULMAX2-RV64I-NEXT: ret
;
-; LMULMAX1-RV32-LABEL: ctlz_v4i32:
-; LMULMAX1-RV32: # %bb.0:
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vle32.v v8, (a0)
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vnot.v v8, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT: lui a1, 349525
-; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 209715
-; LMULMAX1-RV32-NEXT: addi a1, a1, 819
-; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 61681
-; LMULMAX1-RV32-NEXT: addi a1, a1, -241
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT: lui a1, 4112
-; LMULMAX1-RV32-NEXT: addi a1, a1, 257
-; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT: vse32.v v8, (a0)
-; LMULMAX1-RV32-NEXT: ret
+; LMULMAX2-RV32F-LABEL: ctlz_v4i32:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: vmset.m v0
+; LMULMAX2-RV32F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT: vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX2-RV32F-NEXT: fsrm a1
+; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v9, 23
+; LMULMAX2-RV32F-NEXT: li a1, 158
+; LMULMAX2-RV32F-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT: li a1, 32
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
;
-; LMULMAX1-RV64-LABEL: ctlz_v4i32:
-; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV64-NEXT: vle32.v v8, (a0)
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vnot.v v8, v8
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT: lui a1, 349525
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: lui a1, 209715
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: lui a1, 61681
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: lui a1, 4112
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 257
-; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX1-RV64-NEXT: vse32.v v8, (a0)
-; LMULMAX1-RV64-NEXT: ret
+; LMULMAX2-RV64F-LABEL: ctlz_v4i32:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vmset.m v0
+; LMULMAX2-RV64F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT: vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX2-RV64F-NEXT: fsrm a1
+; LMULMAX2-RV64F-NEXT: vsrl.vi v9, v9, 23
+; LMULMAX2-RV64F-NEXT: li a1, 158
+; LMULMAX2-RV64F-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 32
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64F-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
;
; LMULMAX2-RV32D-LABEL: ctlz_v4i32:
; LMULMAX2-RV32D: # %bb.0:
@@ -456,258 +444,204 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: ctlz_v2i64:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: li a1, 32
-; LMULMAX2-RV32-NEXT: vsrl.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.i v9, -1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vxor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 349525
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 209715
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 61681
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 4112
-; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: li a1, 56
-; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vse64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
+; LMULMAX2-RV32I-LABEL: ctlz_v2i64:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 8
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 16
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: li a1, 32
+; LMULMAX2-RV32I-NEXT: vsrl.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.i v9, -1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 349525
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v9, v9, v10
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 209715
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 61681
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 4112
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: li a1, 56
+; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX2-RV64-LABEL: ctlz_v2i64:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: li a1, 32
-; LMULMAX2-RV64-NEXT: vsrl.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vnot.v v8, v8
-; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI3_0)
-; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI3_1)
-; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a2
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI3_2)
-; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI3_3)
-; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: li a1, 56
-; LMULMAX2-RV64-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vse64.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64I-LABEL: ctlz_v2i64:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64I-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 2
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 8
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 16
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: li a1, 32
+; LMULMAX2-RV64I-NEXT: vsrl.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI3_0)
+; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI3_0)(a1)
+; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI3_1)
+; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI3_1)(a2)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a2
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI3_2)
+; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI3_2)(a1)
+; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI3_3)
+; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI3_3)(a2)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT: li a1, 56
+; LMULMAX2-RV64I-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
-; LMULMAX1-RV32-LABEL: ctlz_v2i64:
-; LMULMAX1-RV32: # %bb.0:
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: li a1, 32
-; LMULMAX1-RV32-NEXT: vsrl.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.i v9, -1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vxor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT: lui a1, 349525
-; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 209715
-; LMULMAX1-RV32-NEXT: addi a1, a1, 819
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v10, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 61681
-; LMULMAX1-RV32-NEXT: addi a1, a1, -241
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 4112
-; LMULMAX1-RV32-NEXT: addi a1, a1, 257
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: li a1, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT: vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT: ret
+; LMULMAX2-RV32F-LABEL: ctlz_v2i64:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; LMULMAX2-RV32F-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: vmset.m v0
+; LMULMAX2-RV32F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v9, v8, v0.t
+; LMULMAX2-RV32F-NEXT: fsrm a1
+; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v9, 23
+; LMULMAX2-RV32F-NEXT: li a1, 190
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; LMULMAX2-RV32F-NEXT: vwsubu.wv v10, v10, v9
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vmv.v.i v9, 0
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vmseq.vv v0, v8, v9
+; LMULMAX2-RV32F-NEXT: li a1, 64
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
;
-; LMULMAX1-RV64-LABEL: ctlz_v2i64:
-; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: li a1, 32
-; LMULMAX1-RV64-NEXT: vsrl.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vnot.v v8, v8
-; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI3_0)
-; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI3_1)
-; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a2
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI3_2)
-; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI3_3)
-; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX1-RV64-NEXT: li a1, 56
-; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: vse64.v v8, (a0)
-; LMULMAX1-RV64-NEXT: ret
+; LMULMAX2-RV64F-LABEL: ctlz_v2i64:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; LMULMAX2-RV64F-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vmset.m v0
+; LMULMAX2-RV64F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT: vfncvt.f.xu.w v9, v8, v0.t
+; LMULMAX2-RV64F-NEXT: fsrm a1
+; LMULMAX2-RV64F-NEXT: vsrl.vi v9, v9, 23
+; LMULMAX2-RV64F-NEXT: li a1, 190
+; LMULMAX2-RV64F-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV64F-NEXT: vwsubu.vv v11, v10, v9
+; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 64
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v11, a1, v0
+; LMULMAX2-RV64F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: ctlz_v2i64:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32D-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: vmset.m v0
+; LMULMAX2-RV32D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX2-RV32D-NEXT: fsrm a1
+; LMULMAX2-RV32D-NEXT: li a1, 52
+; LMULMAX2-RV32D-NEXT: vsrl.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT: li a1, 1086
+; LMULMAX2-RV32D-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32D-NEXT: vmv.v.i v10, 0
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32D-NEXT: vmseq.vv v0, v8, v10
+; LMULMAX2-RV32D-NEXT: li a1, 64
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: ctlz_v2i64:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64D-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vmset.m v0
+; LMULMAX2-RV64D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX2-RV64D-NEXT: fsrm a1
+; LMULMAX2-RV64D-NEXT: li a1, 52
+; LMULMAX2-RV64D-NEXT: vsrl.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT: li a1, 1086
+; LMULMAX2-RV64D-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: li a1, 64
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
;
; LMULMAX8-RV32-LABEL: ctlz_v2i64:
; LMULMAX8-RV32: # %bb.0:
; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX8-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: li a1, 32
-; LMULMAX8-RV32-NEXT: vsrl.vx v9, v8, a1
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.i v9, -1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX8-RV32-NEXT: lui a1, 349525
-; LMULMAX8-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: lui a1, 209715
-; LMULMAX8-RV32-NEXT: addi a1, a1, 819
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9
-; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: lui a1, 61681
-; LMULMAX8-RV32-NEXT: addi a1, a1, -241
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: lui a1, 4112
-; LMULMAX8-RV32-NEXT: addi a1, a1, 257
+; LMULMAX8-RV32-NEXT: vmset.m v0
+; LMULMAX8-RV32-NEXT: fsrmi a1, 1
+; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX8-RV32-NEXT: fsrm a1
+; LMULMAX8-RV32-NEXT: li a1, 52
+; LMULMAX8-RV32-NEXT: vsrl.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT: li a1, 1086
+; LMULMAX8-RV32-NEXT: vrsub.vx v9, v9, a1
; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
+; LMULMAX8-RV32-NEXT: vmv.v.i v10, 0
; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: li a1, 56
-; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT: vmseq.vv v0, v8, v10
+; LMULMAX8-RV32-NEXT: li a1, 64
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0
; LMULMAX8-RV32-NEXT: vse64.v v8, (a0)
; LMULMAX8-RV32-NEXT: ret
;
@@ -715,41 +649,17 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
; LMULMAX8-RV64: # %bb.0:
; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX8-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 2
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 8
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 16
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: li a1, 32
-; LMULMAX8-RV64-NEXT: vsrl.vx v9, v8, a1
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: vnot.v v8, v8
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI3_0)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI3_1)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a2
-; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI3_2)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI3_3)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: li a1, 56
-; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vmset.m v0
+; LMULMAX8-RV64-NEXT: fsrmi a1, 1
+; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX8-RV64-NEXT: fsrm a1
+; LMULMAX8-RV64-NEXT: li a1, 52
+; LMULMAX8-RV64-NEXT: vsrl.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: li a1, 1086
+; LMULMAX8-RV64-NEXT: vrsub.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: li a1, 64
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0
; LMULMAX8-RV64-NEXT: vse64.v v8, (a0)
; LMULMAX8-RV64-NEXT: ret
%a = load <2 x i64>, ptr %x
@@ -1069,209 +979,149 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1)
define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: ctlz_v8i32:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vle32.v v8, (a0)
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vnot.v v8, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 349525
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 209715
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 61681
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: lui a1, 4112
-; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX2-RV32-NEXT: vse32.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
-;
-; LMULMAX2-RV64-LABEL: ctlz_v8i32:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV64-NEXT: vle32.v v8, (a0)
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vnot.v v8, v8
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT: lui a1, 349525
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: lui a1, 209715
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: lui a1, 61681
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: lui a1, 4112
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 257
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX2-RV64-NEXT: vse32.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV32I-LABEL: ctlz_v8i32:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 2
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 8
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 16
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 349525
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 209715
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 61681
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: lui a1, 4112
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24
+; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX1-RV32-LABEL: ctlz_v8i32:
-; LMULMAX1-RV32: # %bb.0:
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: addi a1, a0, 16
-; LMULMAX1-RV32-NEXT: vle32.v v8, (a1)
-; LMULMAX1-RV32-NEXT: vle32.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vnot.v v8, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT: lui a2, 349525
-; LMULMAX1-RV32-NEXT: addi a2, a2, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: lui a3, 209715
-; LMULMAX1-RV32-NEXT: addi a3, a3, 819
-; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a3
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a3
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: lui a4, 61681
-; LMULMAX1-RV32-NEXT: addi a4, a4, -241
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4
-; LMULMAX1-RV32-NEXT: lui a5, 4112
-; LMULMAX1-RV32-NEXT: addi a5, a5, 257
-; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a5
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 2
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 8
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 16
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vnot.v v9, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2
-; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a3
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a3
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4
-; LMULMAX1-RV32-NEXT: vmul.vx v9, v9, a5
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 24
-; LMULMAX1-RV32-NEXT: vse32.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vse32.v v8, (a1)
-; LMULMAX1-RV32-NEXT: ret
+; LMULMAX2-RV64I-LABEL: ctlz_v8i32:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 2
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 8
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 16
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT: lui a1, 349525
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365
+; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: lui a1, 209715
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819
+; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: lui a1, 61681
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: lui a1, 4112
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24
+; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
-; LMULMAX1-RV64-LABEL: ctlz_v8i32:
-; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV64-NEXT: addi a1, a0, 16
-; LMULMAX1-RV64-NEXT: vle32.v v8, (a1)
-; LMULMAX1-RV64-NEXT: vle32.v v9, (a0)
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vnot.v v8, v8
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT: lui a2, 349525
-; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2
-; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: lui a3, 209715
-; LMULMAX1-RV64-NEXT: addiw a3, a3, 819
-; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a3
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a3
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: lui a4, 61681
-; LMULMAX1-RV64-NEXT: addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4
-; LMULMAX1-RV64-NEXT: lui a5, 4112
-; LMULMAX1-RV64-NEXT: addiw a5, a5, 257
-; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a5
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 2
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 8
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 16
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vnot.v v9, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2
-; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a3
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a3
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4
-; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a5
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 24
-; LMULMAX1-RV64-NEXT: vse32.v v9, (a0)
-; LMULMAX1-RV64-NEXT: vse32.v v8, (a1)
-; LMULMAX1-RV64-NEXT: ret
+; LMULMAX2-RV32F-LABEL: ctlz_v8i32:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: vmset.m v0
+; LMULMAX2-RV32F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV32F-NEXT: fsrm a1
+; LMULMAX2-RV32F-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV32F-NEXT: li a1, 158
+; LMULMAX2-RV32F-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT: li a1, 32
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32F-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
+;
+; LMULMAX2-RV64F-LABEL: ctlz_v8i32:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64F-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vmset.m v0
+; LMULMAX2-RV64F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV64F-NEXT: fsrm a1
+; LMULMAX2-RV64F-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV64F-NEXT: li a1, 158
+; LMULMAX2-RV64F-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 32
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64F-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: ctlz_v8i32:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: vmset.m v0
+; LMULMAX2-RV32D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV32D-NEXT: fsrm a1
+; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV32D-NEXT: li a1, 158
+; LMULMAX2-RV32D-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32D-NEXT: li a1, 32
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: ctlz_v8i32:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vmset.m v0
+; LMULMAX2-RV64D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV64D-NEXT: fsrm a1
+; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV64D-NEXT: li a1, 158
+; LMULMAX2-RV64D-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: li a1, 32
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
;
; LMULMAX8-LABEL: ctlz_v8i32:
; LMULMAX8: # %bb.0:
@@ -1296,314 +1146,204 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: ctlz_v4i64:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: li a1, 32
-; LMULMAX2-RV32-NEXT: vsrl.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.i v10, -1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vxor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 349525
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 209715
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v12, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v12, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 61681
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 4112
-; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: li a1, 56
-; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vse64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
+; LMULMAX2-RV32I-LABEL: ctlz_v4i64:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 2
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 8
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 16
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: li a1, 32
+; LMULMAX2-RV32I-NEXT: vsrl.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.i v10, -1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 349525
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v12, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v10, v10, v12
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 209715
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v12, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v12, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 61681
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 4112
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: li a1, 56
+; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX2-RV64-LABEL: ctlz_v4i64:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: li a1, 32
-; LMULMAX2-RV64-NEXT: vsrl.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vnot.v v8, v8
-; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI7_0)
-; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI7_0)(a1)
-; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI7_1)
-; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI7_1)(a2)
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a2
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI7_2)
-; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI7_2)(a1)
-; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI7_3)
-; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI7_3)(a2)
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: li a1, 56
-; LMULMAX2-RV64-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vse64.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64I-LABEL: ctlz_v4i64:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64I-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 2
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 8
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 16
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: li a1, 32
+; LMULMAX2-RV64I-NEXT: vsrl.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI7_0)
+; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI7_0)(a1)
+; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI7_1)
+; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI7_1)(a2)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a2
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI7_2)
+; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI7_2)(a1)
+; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI7_3)
+; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI7_3)(a2)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT: li a1, 56
+; LMULMAX2-RV64I-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
-; LMULMAX1-RV32-LABEL: ctlz_v4i64:
-; LMULMAX1-RV32: # %bb.0:
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: addi a1, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
-; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: li a2, 32
-; LMULMAX1-RV32-NEXT: vsrl.vx v10, v8, a2
-; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.i v10, -1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vxor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v11, v8, 1
-; LMULMAX1-RV32-NEXT: lui a3, 349525
-; LMULMAX1-RV32-NEXT: addi a3, a3, 1365
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v12, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v12
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT: lui a3, 209715
-; LMULMAX1-RV32-NEXT: addi a3, a3, 819
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v11, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v13, v8, v11
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v13, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v13, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT: lui a3, 61681
-; LMULMAX1-RV32-NEXT: addi a3, a3, -241
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v13, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT: lui a3, 4112
-; LMULMAX1-RV32-NEXT: addi a3, a3, 257
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v14, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v14
-; LMULMAX1-RV32-NEXT: li a3, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a3
-; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 1
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 2
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 4
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 8
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vsrl.vi v15, v9, 16
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vsrl.vx v15, v9, a2
-; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vxor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v12
-; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vand.vv v10, v9, v11
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v13
-; LMULMAX1-RV32-NEXT: vmul.vv v9, v9, v14
-; LMULMAX1-RV32-NEXT: vsrl.vx v9, v9, a3
-; LMULMAX1-RV32-NEXT: vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v8, (a1)
-; LMULMAX1-RV32-NEXT: ret
+; LMULMAX2-RV32F-LABEL: ctlz_v4i64:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: vmset.m v0
+; LMULMAX2-RV32F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v10, v8, v0.t
+; LMULMAX2-RV32F-NEXT: fsrm a1
+; LMULMAX2-RV32F-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV32F-NEXT: li a1, 190
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vmv.v.x v12, a1
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vwsubu.wv v12, v12, v10
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vmv.v.i v10, 0
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vmseq.vv v0, v8, v10
+; LMULMAX2-RV32F-NEXT: li a1, 64
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v12, a1, v0
+; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
;
-; LMULMAX1-RV64-LABEL: ctlz_v4i64:
-; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV64-NEXT: addi a1, a0, 16
-; LMULMAX1-RV64-NEXT: vle64.v v8, (a1)
-; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: li a2, 32
-; LMULMAX1-RV64-NEXT: vsrl.vx v10, v8, a2
-; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vnot.v v8, v8
-; LMULMAX1-RV64-NEXT: lui a3, %hi(.LCPI7_0)
-; LMULMAX1-RV64-NEXT: ld a3, %lo(.LCPI7_0)(a3)
-; LMULMAX1-RV64-NEXT: lui a4, %hi(.LCPI7_1)
-; LMULMAX1-RV64-NEXT: ld a4, %lo(.LCPI7_1)(a4)
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a4
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV64-NEXT: lui a5, %hi(.LCPI7_2)
-; LMULMAX1-RV64-NEXT: ld a5, %lo(.LCPI7_2)(a5)
-; LMULMAX1-RV64-NEXT: lui a6, %hi(.LCPI7_3)
-; LMULMAX1-RV64-NEXT: ld a6, %lo(.LCPI7_3)(a6)
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5
-; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a6
-; LMULMAX1-RV64-NEXT: li a7, 56
-; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a7
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 2
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 8
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 16
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vx v10, v9, a2
-; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vnot.v v9, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a4
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5
-; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a6
-; LMULMAX1-RV64-NEXT: vsrl.vx v9, v9, a7
-; LMULMAX1-RV64-NEXT: vse64.v v9, (a0)
-; LMULMAX1-RV64-NEXT: vse64.v v8, (a1)
-; LMULMAX1-RV64-NEXT: ret
+; LMULMAX2-RV64F-LABEL: ctlz_v4i64:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vmset.m v0
+; LMULMAX2-RV64F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT: vfncvt.f.xu.w v10, v8, v0.t
+; LMULMAX2-RV64F-NEXT: fsrm a1
+; LMULMAX2-RV64F-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV64F-NEXT: li a1, 190
+; LMULMAX2-RV64F-NEXT: vmv.v.x v11, a1
+; LMULMAX2-RV64F-NEXT: vwsubu.vv v12, v11, v10
+; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 64
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v12, a1, v0
+; LMULMAX2-RV64F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: ctlz_v4i64:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: vmset.m v0
+; LMULMAX2-RV32D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV32D-NEXT: fsrm a1
+; LMULMAX2-RV32D-NEXT: li a1, 52
+; LMULMAX2-RV32D-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT: li a1, 1086
+; LMULMAX2-RV32D-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vmv.v.i v12, 0
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vmseq.vv v0, v8, v12
+; LMULMAX2-RV32D-NEXT: li a1, 64
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: ctlz_v4i64:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64D-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vmset.m v0
+; LMULMAX2-RV64D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV64D-NEXT: fsrm a1
+; LMULMAX2-RV64D-NEXT: li a1, 52
+; LMULMAX2-RV64D-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT: li a1, 1086
+; LMULMAX2-RV64D-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: li a1, 64
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
;
; LMULMAX8-RV32-LABEL: ctlz_v4i64:
; LMULMAX8-RV32: # %bb.0:
; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX8-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: li a1, 32
-; LMULMAX8-RV32-NEXT: vsrl.vx v10, v8, a1
-; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.i v10, -1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX8-RV32-NEXT: lui a1, 349525
-; LMULMAX8-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12
-; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: lui a1, 209715
-; LMULMAX8-RV32-NEXT: addi a1, a1, 819
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: lui a1, 61681
-; LMULMAX8-RV32-NEXT: addi a1, a1, -241
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: lui a1, 4112
-; LMULMAX8-RV32-NEXT: addi a1, a1, 257
+; LMULMAX8-RV32-NEXT: vmset.m v0
+; LMULMAX8-RV32-NEXT: fsrmi a1, 1
+; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX8-RV32-NEXT: fsrm a1
+; LMULMAX8-RV32-NEXT: li a1, 52
+; LMULMAX8-RV32-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT: li a1, 1086
+; LMULMAX8-RV32-NEXT: vrsub.vx v10, v10, a1
; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vmv.v.i v12, 0
; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: li a1, 56
-; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT: vmseq.vv v0, v8, v12
+; LMULMAX8-RV32-NEXT: li a1, 64
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0
; LMULMAX8-RV32-NEXT: vse64.v v8, (a0)
; LMULMAX8-RV32-NEXT: ret
;
@@ -1611,41 +1351,17 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
; LMULMAX8-RV64: # %bb.0:
; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX8-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 2
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 8
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 16
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: li a1, 32
-; LMULMAX8-RV64-NEXT: vsrl.vx v10, v8, a1
-; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vnot.v v8, v8
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI7_0)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI7_0)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI7_1)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI7_1)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1
-; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a2
-; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI7_2)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI7_2)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI7_3)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI7_3)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: li a1, 56
-; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vmset.m v0
+; LMULMAX8-RV64-NEXT: fsrmi a1, 1
+; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX8-RV64-NEXT: fsrm a1
+; LMULMAX8-RV64-NEXT: li a1, 52
+; LMULMAX8-RV64-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: li a1, 1086
+; LMULMAX8-RV64-NEXT: vrsub.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: li a1, 64
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0
; LMULMAX8-RV64-NEXT: vse64.v v8, (a0)
; LMULMAX8-RV64-NEXT: ret
%a = load <4 x i64>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 144f469bf1436..4d2db34ae4cf1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64I
; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32F
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64F
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32D
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64D
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32
@@ -181,6 +183,38 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
; LMULMAX1-RV64-NEXT: vse16.v v8, (a0)
; LMULMAX1-RV64-NEXT: ret
;
+; LMULMAX2-RV32F-LABEL: cttz_v8i16:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV32F-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV32F-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX2-RV32F-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX2-RV32F-NEXT: li a1, 127
+; LMULMAX2-RV32F-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT: li a1, 16
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
+;
+; LMULMAX2-RV64F-LABEL: cttz_v8i16:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vle16.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV64F-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV64F-NEXT: vfwcvt.f.xu.v v10, v9
+; LMULMAX2-RV64F-NEXT: vnsrl.wi v9, v10, 23
+; LMULMAX2-RV64F-NEXT: li a1, 127
+; LMULMAX2-RV64F-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 16
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64F-NEXT: vse16.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
+;
; LMULMAX2-RV32D-LABEL: cttz_v8i16:
; LMULMAX2-RV32D: # %bb.0:
; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, ma
@@ -299,67 +333,43 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0)
; LMULMAX2-RV64I-NEXT: ret
;
-; LMULMAX1-RV32-LABEL: cttz_v4i32:
-; LMULMAX1-RV32: # %bb.0:
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vle32.v v8, (a0)
-; LMULMAX1-RV32-NEXT: li a1, 1
-; LMULMAX1-RV32-NEXT: vsub.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT: vnot.v v8, v8
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT: lui a1, 349525
-; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 209715
-; LMULMAX1-RV32-NEXT: addi a1, a1, 819
-; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 61681
-; LMULMAX1-RV32-NEXT: addi a1, a1, -241
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT: lui a1, 4112
-; LMULMAX1-RV32-NEXT: addi a1, a1, 257
-; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT: vse32.v v8, (a0)
-; LMULMAX1-RV32-NEXT: ret
+; LMULMAX2-RV32F-LABEL: cttz_v4i32:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV32F-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV32F-NEXT: vmset.m v0
+; LMULMAX2-RV32F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX2-RV32F-NEXT: fsrm a1
+; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v9, 23
+; LMULMAX2-RV32F-NEXT: li a1, 127
+; LMULMAX2-RV32F-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT: li a1, 32
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
;
-; LMULMAX1-RV64-LABEL: cttz_v4i32:
-; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV64-NEXT: vle32.v v8, (a0)
-; LMULMAX1-RV64-NEXT: li a1, 1
-; LMULMAX1-RV64-NEXT: vsub.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT: vnot.v v8, v8
-; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT: lui a1, 349525
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: lui a1, 209715
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: lui a1, 61681
-; LMULMAX1-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: lui a1, 4112
-; LMULMAX1-RV64-NEXT: addiw a1, a1, 257
-; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX1-RV64-NEXT: vse32.v v8, (a0)
-; LMULMAX1-RV64-NEXT: ret
+; LMULMAX2-RV64F-LABEL: cttz_v4i32:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV64F-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV64F-NEXT: vmset.m v0
+; LMULMAX2-RV64F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX2-RV64F-NEXT: fsrm a1
+; LMULMAX2-RV64F-NEXT: vsrl.vi v9, v9, 23
+; LMULMAX2-RV64F-NEXT: li a1, 127
+; LMULMAX2-RV64F-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 32
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64F-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
;
; LMULMAX2-RV32D-LABEL: cttz_v4i32:
; LMULMAX2-RV32D: # %bb.0:
@@ -420,208 +430,197 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: cttz_v2i64:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: li a1, 1
-; LMULMAX2-RV32-NEXT: vsub.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.i v10, -1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vxor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 349525
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 209715
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v10, v8, v9
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 61681
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: lui a1, 4112
-; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT: li a1, 56
-; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vse64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
+; LMULMAX2-RV32I-LABEL: cttz_v2i64:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: li a1, 1
+; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.i v10, -1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 349525
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v9, v9, v10
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 209715
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v10, v8, v9
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 61681
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: lui a1, 4112
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT: li a1, 56
+; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX2-RV64-LABEL: cttz_v2i64:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV64-NEXT: li a1, 1
-; LMULMAX2-RV64-NEXT: vsub.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT: vnot.v v8, v8
-; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI3_0)
-; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI3_1)
-; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a2
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI3_2)
-; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI3_3)
-; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: li a1, 56
-; LMULMAX2-RV64-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vse64.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64I-LABEL: cttz_v2i64:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64I-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: li a1, 1
+; LMULMAX2-RV64I-NEXT: vsub.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT: vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI3_0)
+; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI3_0)(a1)
+; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI3_1)
+; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI3_1)(a2)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a2
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8
+; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI3_2)
+; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI3_2)(a1)
+; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI3_3)
+; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI3_3)(a2)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT: li a1, 56
+; LMULMAX2-RV64I-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
-; LMULMAX1-RV32-LABEL: cttz_v2i64:
-; LMULMAX1-RV32: # %bb.0:
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX1-RV32-NEXT: li a1, 1
-; LMULMAX1-RV32-NEXT: vsub.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.i v10, -1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vxor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT: lui a1, 349525
-; LMULMAX1-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 209715
-; LMULMAX1-RV32-NEXT: addi a1, a1, 819
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v10, v8, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 61681
-; LMULMAX1-RV32-NEXT: addi a1, a1, -241
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: lui a1, 4112
-; LMULMAX1-RV32-NEXT: addi a1, a1, 257
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT: li a1, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT: vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT: ret
+; LMULMAX2-RV32F-LABEL: cttz_v2i64:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vle64.v v9, (a0)
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vmv.v.i v10, 0
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vmseq.vv v8, v9, v10
+; LMULMAX2-RV32F-NEXT: vsub.vv v10, v10, v9
+; LMULMAX2-RV32F-NEXT: vand.vv v9, v9, v10
+; LMULMAX2-RV32F-NEXT: vmset.m v0
+; LMULMAX2-RV32F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v10, v9, v0.t
+; LMULMAX2-RV32F-NEXT: fsrm a1
+; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v10, 23
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vzext.vf2 v10, v9
+; LMULMAX2-RV32F-NEXT: li a1, 127
+; LMULMAX2-RV32F-NEXT: vsub.vx v9, v10, a1
+; LMULMAX2-RV32F-NEXT: li a1, 64
+; LMULMAX2-RV32F-NEXT: vmv.v.v v0, v8
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
;
-; LMULMAX1-RV64-LABEL: cttz_v2i64:
-; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX1-RV64-NEXT: li a1, 1
-; LMULMAX1-RV64-NEXT: vsub.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT: vnot.v v8, v8
-; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI3_0)
-; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI3_1)
-; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a2
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI3_2)
-; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI3_3)
-; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX1-RV64-NEXT: li a1, 56
-; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT: vse64.v v8, (a0)
-; LMULMAX1-RV64-NEXT: ret
+; LMULMAX2-RV64F-LABEL: cttz_v2i64:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV64F-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV64F-NEXT: vmset.m v0
+; LMULMAX2-RV64F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; LMULMAX2-RV64F-NEXT: vfncvt.f.xu.w v10, v9, v0.t
+; LMULMAX2-RV64F-NEXT: fsrm a1
+; LMULMAX2-RV64F-NEXT: vsrl.vi v9, v10, 23
+; LMULMAX2-RV64F-NEXT: li a1, 127
+; LMULMAX2-RV64F-NEXT: vwsubu.vx v10, v9, a1
+; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 64
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: cttz_v2i64:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32D-NEXT: vle64.v v9, (a0)
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32D-NEXT: vmv.v.i v10, 0
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32D-NEXT: vmseq.vv v8, v9, v10
+; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v9
+; LMULMAX2-RV32D-NEXT: vand.vv v9, v9, v10
+; LMULMAX2-RV32D-NEXT: vmset.m v0
+; LMULMAX2-RV32D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX2-RV32D-NEXT: fsrm a1
+; LMULMAX2-RV32D-NEXT: li a1, 52
+; LMULMAX2-RV32D-NEXT: vsrl.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT: li a1, 1023
+; LMULMAX2-RV32D-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT: li a1, 64
+; LMULMAX2-RV32D-NEXT: vmv.v.v v0, v8
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: cttz_v2i64:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64D-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX2-RV64D-NEXT: vand.vv v9, v8, v9
+; LMULMAX2-RV64D-NEXT: vmset.m v0
+; LMULMAX2-RV64D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX2-RV64D-NEXT: fsrm a1
+; LMULMAX2-RV64D-NEXT: li a1, 52
+; LMULMAX2-RV64D-NEXT: vsrl.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT: li a1, 1023
+; LMULMAX2-RV64D-NEXT: vsub.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: li a1, 64
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
;
; LMULMAX8-RV32-LABEL: cttz_v2i64:
; LMULMAX8-RV32: # %bb.0:
; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT: li a1, 1
-; LMULMAX8-RV32-NEXT: vsub.vx v9, v8, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.i v10, -1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX8-RV32-NEXT: lui a1, 349525
-; LMULMAX8-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV32-NEXT: vle64.v v9, (a0)
; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT: vmv.v.i v10, 0
; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX8-RV32-NEXT: vmseq.vv v8, v9, v10
+; LMULMAX8-RV32-NEXT: vsub.vv v10, v10, v9
; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: lui a1, 209715
-; LMULMAX8-RV32-NEXT: addi a1, a1, 819
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9
-; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: lui a1, 61681
-; LMULMAX8-RV32-NEXT: addi a1, a1, -241
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: lui a1, 4112
-; LMULMAX8-RV32-NEXT: addi a1, a1, 257
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT: li a1, 56
-; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT: vmset.m v0
+; LMULMAX8-RV32-NEXT: fsrmi a1, 1
+; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX8-RV32-NEXT: fsrm a1
+; LMULMAX8-RV32-NEXT: li a1, 52
+; LMULMAX8-RV32-NEXT: vsrl.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT: li a1, 1023
+; LMULMAX8-RV32-NEXT: vsub.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT: li a1, 64
+; LMULMAX8-RV32-NEXT: vmv.v.v v0, v8
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0
; LMULMAX8-RV32-NEXT: vse64.v v8, (a0)
; LMULMAX8-RV32-NEXT: ret
;
@@ -629,31 +628,19 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
; LMULMAX8-RV64: # %bb.0:
; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX8-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT: li a1, 1
-; LMULMAX8-RV64-NEXT: vsub.vx v9, v8, a1
-; LMULMAX8-RV64-NEXT: vnot.v v8, v8
-; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI3_0)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI3_1)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1
-; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1
-; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a2
-; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI3_2)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI3_3)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: li a1, 56
-; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0
+; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9
+; LMULMAX8-RV64-NEXT: vmset.m v0
+; LMULMAX8-RV64-NEXT: fsrmi a1, 1
+; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX8-RV64-NEXT: fsrm a1
+; LMULMAX8-RV64-NEXT: li a1, 52
+; LMULMAX8-RV64-NEXT: vsrl.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: li a1, 1023
+; LMULMAX8-RV64-NEXT: vsub.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: li a1, 64
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0
; LMULMAX8-RV64-NEXT: vse64.v v8, (a0)
; LMULMAX8-RV64-NEXT: ret
%a = load <2 x i64>, ptr %x
@@ -936,165 +923,143 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: cttz_v8i32:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vle32.v v8, (a0)
-; LMULMAX2-RV32-NEXT: li a1, 1
-; LMULMAX2-RV32-NEXT: vsub.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT: vnot.v v8, v8
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 349525
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 209715
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 61681
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: lui a1, 4112
-; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX2-RV32-NEXT: vse32.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
+; LMULMAX2-RV32I-LABEL: cttz_v8i32:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: li a1, 1
+; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT: vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 349525
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 209715
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 61681
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: lui a1, 4112
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24
+; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX2-RV64-LABEL: cttz_v8i32:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV64-NEXT: vle32.v v8, (a0)
-; LMULMAX2-RV64-NEXT: li a1, 1
-; LMULMAX2-RV64-NEXT: vsub.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT: vnot.v v8, v8
-; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT: lui a1, 349525
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: lui a1, 209715
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: lui a1, 61681
-; LMULMAX2-RV64-NEXT: addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: lui a1, 4112
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 257
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX2-RV64-NEXT: vse32.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64I-LABEL: cttz_v8i32:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: li a1, 1
+; LMULMAX2-RV64I-NEXT: vsub.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT: vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT: lui a1, 349525
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365
+; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: lui a1, 209715
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819
+; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: lui a1, 61681
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: lui a1, 4112
+; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24
+; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
-; LMULMAX1-RV32-LABEL: cttz_v8i32:
-; LMULMAX1-RV32: # %bb.0:
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: addi a1, a0, 16
-; LMULMAX1-RV32-NEXT: vle32.v v8, (a1)
-; LMULMAX1-RV32-NEXT: vle32.v v9, (a0)
-; LMULMAX1-RV32-NEXT: li a2, 1
-; LMULMAX1-RV32-NEXT: vsub.vx v10, v8, a2
-; LMULMAX1-RV32-NEXT: vnot.v v8, v8
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT: lui a3, 349525
-; LMULMAX1-RV32-NEXT: addi a3, a3, 1365
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: lui a4, 209715
-; LMULMAX1-RV32-NEXT: addi a4, a4, 819
-; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a4
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: lui a5, 61681
-; LMULMAX1-RV32-NEXT: addi a5, a5, -241
-; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a5
-; LMULMAX1-RV32-NEXT: lui a6, 4112
-; LMULMAX1-RV32-NEXT: addi a6, a6, 257
-; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a6
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT: vsub.vx v10, v9, a2
-; LMULMAX1-RV32-NEXT: vnot.v v9, v9
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a4
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a5
-; LMULMAX1-RV32-NEXT: vmul.vx v9, v9, a6
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 24
-; LMULMAX1-RV32-NEXT: vse32.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vse32.v v8, (a1)
-; LMULMAX1-RV32-NEXT: ret
+; LMULMAX2-RV32F-LABEL: cttz_v8i32:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX2-RV32F-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-RV32F-NEXT: vmset.m v0
+; LMULMAX2-RV32F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV32F-NEXT: fsrm a1
+; LMULMAX2-RV32F-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV32F-NEXT: li a1, 127
+; LMULMAX2-RV32F-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV32F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT: li a1, 32
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32F-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
;
-; LMULMAX1-RV64-LABEL: cttz_v8i32:
-; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV64-NEXT: addi a1, a0, 16
-; LMULMAX1-RV64-NEXT: vle32.v v8, (a1)
-; LMULMAX1-RV64-NEXT: vle32.v v9, (a0)
-; LMULMAX1-RV64-NEXT: li a2, 1
-; LMULMAX1-RV64-NEXT: vsub.vx v10, v8, a2
-; LMULMAX1-RV64-NEXT: vnot.v v8, v8
-; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT: lui a3, 349525
-; LMULMAX1-RV64-NEXT: addiw a3, a3, 1365
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: lui a4, 209715
-; LMULMAX1-RV64-NEXT: addiw a4, a4, 819
-; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a4
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: lui a5, 61681
-; LMULMAX1-RV64-NEXT: addiw a5, a5, -241
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5
-; LMULMAX1-RV64-NEXT: lui a6, 4112
-; LMULMAX1-RV64-NEXT: addiw a6, a6, 257
-; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a6
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24
-; LMULMAX1-RV64-NEXT: vsub.vx v10, v9, a2
-; LMULMAX1-RV64-NEXT: vnot.v v9, v9
-; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a4
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5
-; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a6
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 24
-; LMULMAX1-RV64-NEXT: vse32.v v9, (a0)
-; LMULMAX1-RV64-NEXT: vse32.v v8, (a1)
-; LMULMAX1-RV64-NEXT: ret
+; LMULMAX2-RV64F-LABEL: cttz_v8i32:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64F-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX2-RV64F-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-RV64F-NEXT: vmset.m v0
+; LMULMAX2-RV64F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV64F-NEXT: fsrm a1
+; LMULMAX2-RV64F-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV64F-NEXT: li a1, 127
+; LMULMAX2-RV64F-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 32
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64F-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: cttz_v8i32:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX2-RV32D-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-RV32D-NEXT: vmset.m v0
+; LMULMAX2-RV32D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV32D-NEXT: fsrm a1
+; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV32D-NEXT: li a1, 127
+; LMULMAX2-RV32D-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV32D-NEXT: li a1, 32
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: cttz_v8i32:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX2-RV64D-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-RV64D-NEXT: vmset.m v0
+; LMULMAX2-RV64D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV64D-NEXT: fsrm a1
+; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 23
+; LMULMAX2-RV64D-NEXT: li a1, 127
+; LMULMAX2-RV64D-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: li a1, 32
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
;
; LMULMAX8-LABEL: cttz_v8i32:
; LMULMAX8: # %bb.0:
@@ -1121,244 +1086,197 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
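The LMULMAX2-RV32F/RV64F/RV32D/RV64D check blocks for cttz_v8i32 above all follow the same f32 recipe: negate-and-AND to isolate the lowest set bit, vfcvt.f.xu.v, shift the converted value right by 23, subtract the bias 127, and merge 32 into the lanes that were zero. A scalar sketch of that recipe (the helper name is hypothetical, not from the patch):

  #include <stdint.h>
  #include <string.h>

  /* Per-lane model of the f32 exponent trick used for i32 elements. */
  static unsigned cttz32_via_f32(uint32_t x) {
      if (x == 0)
          return 32;                 /* vmseq.vi v0, v8, 0 + vmerge.vxm ..., 32 */
      uint32_t lsb = x & (0u - x);   /* vrsub.vi / vand.vv */
      float f = (float)lsb;          /* vfcvt.f.xu.v; exact for a single set bit */
      uint32_t bits;
      memcpy(&bits, &f, sizeof bits);
      return (bits >> 23) - 127;     /* vsrl.vi 23, then vsub.vx 127 */
  }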
define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: cttz_v4i64:
-; LMULMAX2-RV32: # %bb.0:
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: li a1, 1
-; LMULMAX2-RV32-NEXT: vsub.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.i v12, -1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vxor.vv v8, v8, v12
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT: lui a1, 349525
-; LMULMAX2-RV32-NEXT: addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v12, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 209715
-; LMULMAX2-RV32-NEXT: addi a1, a1, 819
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v12, v8, v10
-; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v12, v8
-; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 61681
-; LMULMAX2-RV32-NEXT: addi a1, a1, -241
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: lui a1, 4112
-; LMULMAX2-RV32-NEXT: addi a1, a1, 257
-; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT: vmul.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT: li a1, 56
-; LMULMAX2-RV32-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT: vse64.v v8, (a0)
-; LMULMAX2-RV32-NEXT: ret
+; LMULMAX2-RV32I-LABEL: cttz_v4i64:
+; LMULMAX2-RV32I: # %bb.0:
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: li a1, 1
+; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.i v12, -1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vxor.vv v8, v8, v12
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT: lui a1, 349525
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v12, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v10, v10, v12
+; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 209715
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v12, v8, v10
+; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v12, v8
+; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 61681
+; LMULMAX2-RV32I-NEXT: addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: lui a1, 4112
+; LMULMAX2-RV32I-NEXT: addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT: vmul.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT: li a1, 56
+; LMULMAX2-RV32I-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT: ret
;
-; LMULMAX2-RV64-LABEL: cttz_v4i64:
-; LMULMAX2-RV64: # %bb.0:
-; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX2-RV64-NEXT: li a1, 1
-; LMULMAX2-RV64-NEXT: vsub.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT: vnot.v v8, v8
-; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI7_0)
-; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI7_0)(a1)
-; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI7_1)
-; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI7_1)(a2)
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1
-; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a2
-; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI7_2)
-; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI7_2)(a1)
-; LMULMAX2-RV64-NEXT: lui a2, %hi(.LCPI7_3)
-; LMULMAX2-RV64-NEXT: ld a2, %lo(.LCPI7_3)(a2)
-; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT: li a1, 56
-; LMULMAX2-RV64-NEXT: vsrl.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT: vse64.v v8, (a0)
-; LMULMAX2-RV64-NEXT: ret
+; LMULMAX2-RV64I-LABEL: cttz_v4i64:
+; LMULMAX2-RV64I: # %bb.0:
+; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64I-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: li a1, 1
+; LMULMAX2-RV64I-NEXT: vsub.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT: vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI7_0)
+; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI7_0)(a1)
+; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI7_1)
+; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI7_1)(a2)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1
+; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a2
+; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8
+; LMULMAX2-RV64I-NEXT: lui a1, %hi(.LCPI7_2)
+; LMULMAX2-RV64I-NEXT: ld a1, %lo(.LCPI7_2)(a1)
+; LMULMAX2-RV64I-NEXT: lui a2, %hi(.LCPI7_3)
+; LMULMAX2-RV64I-NEXT: ld a2, %lo(.LCPI7_3)(a2)
+; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT: li a1, 56
+; LMULMAX2-RV64I-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT: ret
;
-; LMULMAX1-RV32-LABEL: cttz_v4i64:
-; LMULMAX1-RV32: # %bb.0:
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: addi a1, a0, 16
-; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
-; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
-; LMULMAX1-RV32-NEXT: li a2, 1
-; LMULMAX1-RV32-NEXT: vsub.vx v10, v8, a2
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.i v11, -1
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vxor.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT: lui a3, 349525
-; LMULMAX1-RV32-NEXT: addi a3, a3, 1365
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v12, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v10, v10, v12
-; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: lui a3, 209715
-; LMULMAX1-RV32-NEXT: addi a3, a3, 819
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v10, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v13, v8, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v13, v8
-; LMULMAX1-RV32-NEXT: vsrl.vi v13, v8, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT: lui a3, 61681
-; LMULMAX1-RV32-NEXT: addi a3, a3, -241
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v13, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT: lui a3, 4112
-; LMULMAX1-RV32-NEXT: addi a3, a3, 257
-; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmv.v.x v14, a3
-; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT: vmul.vv v8, v8, v14
-; LMULMAX1-RV32-NEXT: li a3, 56
-; LMULMAX1-RV32-NEXT: vsrl.vx v8, v8, a3
-; LMULMAX1-RV32-NEXT: vsub.vx v15, v9, a2
-; LMULMAX1-RV32-NEXT: vxor.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT: vsrl.vi v11, v9, 1
-; LMULMAX1-RV32-NEXT: vand.vv v11, v11, v12
-; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT: vand.vv v11, v9, v10
-; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v11, v9
-; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v13
-; LMULMAX1-RV32-NEXT: vmul.vv v9, v9, v14
-; LMULMAX1-RV32-NEXT: vsrl.vx v9, v9, a3
-; LMULMAX1-RV32-NEXT: vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT: vse64.v v8, (a1)
-; LMULMAX1-RV32-NEXT: ret
+; LMULMAX2-RV32F-LABEL: cttz_v4i64:
+; LMULMAX2-RV32F: # %bb.0:
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vle64.v v10, (a0)
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vmv.v.i v12, 0
+; LMULMAX2-RV32F-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vmseq.vv v8, v10, v12
+; LMULMAX2-RV32F-NEXT: vsub.vv v12, v12, v10
+; LMULMAX2-RV32F-NEXT: vand.vv v10, v10, v12
+; LMULMAX2-RV32F-NEXT: vmset.m v0
+; LMULMAX2-RV32F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT: vfncvt.f.xu.w v9, v10, v0.t
+; LMULMAX2-RV32F-NEXT: fsrm a1
+; LMULMAX2-RV32F-NEXT: vsrl.vi v9, v9, 23
+; LMULMAX2-RV32F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT: vzext.vf2 v10, v9
+; LMULMAX2-RV32F-NEXT: li a1, 127
+; LMULMAX2-RV32F-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV32F-NEXT: li a1, 64
+; LMULMAX2-RV32F-NEXT: vmv1r.v v0, v8
+; LMULMAX2-RV32F-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT: ret
;
-; LMULMAX1-RV64-LABEL: cttz_v4i64:
-; LMULMAX1-RV64: # %bb.0:
-; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV64-NEXT: addi a1, a0, 16
-; LMULMAX1-RV64-NEXT: vle64.v v8, (a1)
-; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
-; LMULMAX1-RV64-NEXT: li a2, 1
-; LMULMAX1-RV64-NEXT: vsub.vx v10, v8, a2
-; LMULMAX1-RV64-NEXT: vnot.v v8, v8
-; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: lui a3, %hi(.LCPI7_0)
-; LMULMAX1-RV64-NEXT: ld a3, %lo(.LCPI7_0)(a3)
-; LMULMAX1-RV64-NEXT: lui a4, %hi(.LCPI7_1)
-; LMULMAX1-RV64-NEXT: ld a4, %lo(.LCPI7_1)(a4)
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a4
-; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX1-RV64-NEXT: lui a5, %hi(.LCPI7_2)
-; LMULMAX1-RV64-NEXT: ld a5, %lo(.LCPI7_2)(a5)
-; LMULMAX1-RV64-NEXT: lui a6, %hi(.LCPI7_3)
-; LMULMAX1-RV64-NEXT: ld a6, %lo(.LCPI7_3)(a6)
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5
-; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a6
-; LMULMAX1-RV64-NEXT: li a7, 56
-; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a7
-; LMULMAX1-RV64-NEXT: vsub.vx v10, v9, a2
-; LMULMAX1-RV64-NEXT: vnot.v v9, v9
-; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a4
-; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9
-; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5
-; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a6
-; LMULMAX1-RV64-NEXT: vsrl.vx v9, v9, a7
-; LMULMAX1-RV64-NEXT: vse64.v v9, (a0)
-; LMULMAX1-RV64-NEXT: vse64.v v8, (a1)
-; LMULMAX1-RV64-NEXT: ret
+; LMULMAX2-RV64F-LABEL: cttz_v4i64:
+; LMULMAX2-RV64F: # %bb.0:
+; LMULMAX2-RV64F-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64F-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX2-RV64F-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-RV64F-NEXT: vmset.m v0
+; LMULMAX2-RV64F-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX2-RV64F-NEXT: vfncvt.f.xu.w v12, v10, v0.t
+; LMULMAX2-RV64F-NEXT: fsrm a1
+; LMULMAX2-RV64F-NEXT: vsrl.vi v10, v12, 23
+; LMULMAX2-RV64F-NEXT: li a1, 127
+; LMULMAX2-RV64F-NEXT: vwsubu.vx v12, v10, a1
+; LMULMAX2-RV64F-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV64F-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT: li a1, 64
+; LMULMAX2-RV64F-NEXT: vmerge.vxm v8, v12, a1, v0
+; LMULMAX2-RV64F-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT: ret
+;
+; LMULMAX2-RV32D-LABEL: cttz_v4i64:
+; LMULMAX2-RV32D: # %bb.0:
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vle64.v v10, (a0)
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vmv.v.i v12, 0
+; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT: vmseq.vv v8, v10, v12
+; LMULMAX2-RV32D-NEXT: vsub.vv v12, v12, v10
+; LMULMAX2-RV32D-NEXT: vand.vv v10, v10, v12
+; LMULMAX2-RV32D-NEXT: vmset.m v0
+; LMULMAX2-RV32D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV32D-NEXT: fsrm a1
+; LMULMAX2-RV32D-NEXT: li a1, 52
+; LMULMAX2-RV32D-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT: li a1, 1023
+; LMULMAX2-RV32D-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT: li a1, 64
+; LMULMAX2-RV32D-NEXT: vmv1r.v v0, v8
+; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT: ret
+;
+; LMULMAX2-RV64D-LABEL: cttz_v4i64:
+; LMULMAX2-RV64D: # %bb.0:
+; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64D-NEXT: vle64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX2-RV64D-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-RV64D-NEXT: vmset.m v0
+; LMULMAX2-RV64D-NEXT: fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV64D-NEXT: fsrm a1
+; LMULMAX2-RV64D-NEXT: li a1, 52
+; LMULMAX2-RV64D-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT: li a1, 1023
+; LMULMAX2-RV64D-NEXT: vsub.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT: li a1, 64
+; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64D-NEXT: vse64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT: ret
;
; LMULMAX8-RV32-LABEL: cttz_v4i64:
; LMULMAX8-RV32: # %bb.0:
; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT: li a1, 1
-; LMULMAX8-RV32-NEXT: vsub.vx v10, v8, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.i v12, -1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v12
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX8-RV32-NEXT: lui a1, 349525
-; LMULMAX8-RV32-NEXT: addi a1, a1, 1365
+; LMULMAX8-RV32-NEXT: vle64.v v10, (a0)
; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1
+; LMULMAX8-RV32-NEXT: vmv.v.i v12, 0
; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX8-RV32-NEXT: vmseq.vv v8, v10, v12
+; LMULMAX8-RV32-NEXT: vsub.vv v12, v12, v10
; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12
-; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: lui a1, 209715
-; LMULMAX8-RV32-NEXT: addi a1, a1, 819
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10
-; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8
-; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: lui a1, 61681
-; LMULMAX8-RV32-NEXT: addi a1, a1, -241
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: lui a1, 4112
-; LMULMAX8-RV32-NEXT: addi a1, a1, 257
-; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT: li a1, 56
-; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT: vmset.m v0
+; LMULMAX8-RV32-NEXT: fsrmi a1, 1
+; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX8-RV32-NEXT: fsrm a1
+; LMULMAX8-RV32-NEXT: li a1, 52
+; LMULMAX8-RV32-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT: li a1, 1023
+; LMULMAX8-RV32-NEXT: vsub.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT: li a1, 64
+; LMULMAX8-RV32-NEXT: vmv1r.v v0, v8
+; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0
; LMULMAX8-RV32-NEXT: vse64.v v8, (a0)
; LMULMAX8-RV32-NEXT: ret
;
@@ -1366,31 +1284,19 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
; LMULMAX8-RV64: # %bb.0:
; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; LMULMAX8-RV64-NEXT: vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT: li a1, 1
-; LMULMAX8-RV64-NEXT: vsub.vx v10, v8, a1
-; LMULMAX8-RV64-NEXT: vnot.v v8, v8
-; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI7_0)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI7_0)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI7_1)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI7_1)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1
-; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1
-; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a2
-; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8
-; LMULMAX8-RV64-NEXT: lui a1, %hi(.LCPI7_2)
-; LMULMAX8-RV64-NEXT: ld a1, %lo(.LCPI7_2)(a1)
-; LMULMAX8-RV64-NEXT: lui a2, %hi(.LCPI7_3)
-; LMULMAX8-RV64-NEXT: ld a2, %lo(.LCPI7_3)(a2)
-; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4
-; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT: li a1, 56
-; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0
+; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10
+; LMULMAX8-RV64-NEXT: vmset.m v0
+; LMULMAX8-RV64-NEXT: fsrmi a1, 1
+; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX8-RV64-NEXT: fsrm a1
+; LMULMAX8-RV64-NEXT: li a1, 52
+; LMULMAX8-RV64-NEXT: vsrl.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: li a1, 1023
+; LMULMAX8-RV64-NEXT: vsub.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT: li a1, 64
+; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0
; LMULMAX8-RV64-NEXT: vse64.v v8, (a0)
; LMULMAX8-RV64-NEXT: ret
%a = load <4 x i64>, ptr %x
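In the LMULMAX2-RV32F and LMULMAX2-RV64F blocks above, the i64 elements are narrowed directly to f32 with vfncvt.f.xu.w and the count is still read from the 8-bit f32 exponent; that works because the value being converted is a single set bit, which is exactly representable in f32 even for 64-bit inputs. A scalar model of this variant, again only an illustrative sketch:

  #include <stdint.h>
  #include <string.h>

  /* i64 cttz through the f32 exponent, mirroring the vfncvt.f.xu.w path. */
  static unsigned cttz64_via_f32(uint64_t x) {
      if (x == 0)
          return 64;                 /* handled by vmseq/vmerge in the vector code */
      uint64_t lsb = x & (0 - x);    /* lowest set bit, a power of two <= 2^63 */
      float f = (float)lsb;          /* vfncvt.f.xu.w; 2^63 is exact in f32 */
      uint32_t bits;
      memcpy(&bits, &f, sizeof bits);
      return (bits >> 23) - 127;     /* vsrl.vi 23, then subtract the f32 bias */
  }

The widening back to e64 that the vector code does with vzext.vf2 or vwsubu.vx has no scalar counterpart here; it only restores the element width of the result vector.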