[llvm] 5280d3e - [RISCV] Teach lowerCTLZ_CTTZ_ZERO_UNDEF to handle conversion i32/i64 vectors to f32 vectors.

Yeting Kuo via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 11 22:42:55 PST 2023


Author: Yeting Kuo
Date: 2023-01-12T14:42:47+08:00
New Revision: 5280d3e7384835bb6ee797def32c98f30afaee98

URL: https://github.com/llvm/llvm-project/commit/5280d3e7384835bb6ee797def32c98f30afaee98
DIFF: https://github.com/llvm/llvm-project/commit/5280d3e7384835bb6ee797def32c98f30afaee98.diff

LOG: [RISCV] Teach lowerCTLZ_CTTZ_ZERO_UNDEF to handle conversion i32/i64 vectors to f32 vectors.

Previously, lowerCTLZ_CTTZ_ZERO_UNDEF converted the source to a floating-point
value with ISD::UINT_TO_FP. ISD::UINT_TO_FP uses the dynamic rounding mode, so
rounding may leave the exponent of the result different from what the lowering
expects when converting i32/i64 to f32. This is why lowerCTLZ_CTTZ_ZERO_UNDEF
was constrained to handle an i32 source only when the f64 type with the same
element count as the source is legal.
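
To make the rounding problem concrete, here is a minimal host-side C++ sketch
(illustration only, not code from the patch); it reuses the same bias/adjust
constants that lowerCTLZ_CTTZ_ZERO_UNDEF uses for f32:

  #include <cmath>
  #include <cstdio>

  // Illustration only: the lowering computes ctlz_zero_undef(x) as
  // (ExponentBias + EltSize - 1) - biased_exponent(f32(x)). For
  // x = 0xFFFFFFFF (ctlz == 0), the rounding mode of the uint->f32
  // conversion decides whether the exponent lands in the right binade.
  int main() {
    // Round-to-nearest (the dynamic default): 2^32 - 1 rounds up to 2^32.
    float nearest = 4294967295.0f;       // literal rounds to 4294967296.0f
    // Round-toward-zero (what vfcvt.f.xu.v with frm=RTZ yields): the largest
    // f32 not above 2^32 - 1, i.e. 0x1.fffffep31.
    float toward_zero = 4294967040.0f;

    int adjust = 127 + 31;               // ExponentBias + (EltSize - 1)
    std::printf("RNE: %d\n", adjust - (std::ilogb(nearest) + 127));      // -1, wrong
    std::printf("RTZ: %d\n", adjust - (std::ilogb(toward_zero) + 127));  //  0, correct
  }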

The patch teaches lowerCTLZ_CTTZ_ZERO_UNDEF to convert i32/i64 vectors to f32
vectors with vfcvt.f.xu.v using the RTZ rounding mode. RTZ ensures the exponent
of the result is correct, even though f32 cannot represent every i32/i64 value
exactly.
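
For reference, a scalar sketch of the per-element computation the lowering now
builds for an i32 source (a hedged approximation, not the DAG nodes from the
patch: host casts use the dynamic rounding mode, so fesetround stands in for
the frm=RTZ operand and only works where the host honors FENV_ACCESS):

  #include <cfenv>
  #include <cstdint>
  #include <cstring>

  // Scalar analogue of ctlz_zero_undef for i32: convert to f32 toward zero,
  // bitcast to i32, shift the biased exponent to the LSB, and subtract from
  // ExponentBias + (EltSize - 1).
  unsigned ctlz_zero_undef_u32(uint32_t x) {
    int old = std::fegetround();
    std::fesetround(FE_TOWARDZERO);       // stands in for vfcvt.f.xu.v, frm=RTZ
    volatile uint32_t v = x;              // keep the cast out of the constant folder
    float f = static_cast<float>(v);      // assumes the host honors FENV_ACCESS
    std::fesetround(old);

    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits)); // the bitcast in the lowering
    unsigned exp = bits >> 23;            // ShiftAmt for f32
    return (127 + 31) - exp;              // Adjust = ExponentBias + (EltSize - 1)
  }
  // e.g. ctlz_zero_undef_u32(1) == 31 and ctlz_zero_undef_u32(0xFFFFFFFFu) == 0;
  // x == 0 is undefined, matching CTLZ_ZERO_UNDEF.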

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D140782

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
    llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
    llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 169ff9d22f989..5c8bd226b4db3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -677,16 +677,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       // Splice
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
 
-      // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
-      // type that can represent the value exactly.
-      if (VT.getVectorElementType() != MVT::i64) {
-        MVT FloatEltVT =
-            VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
-        EVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
-        if (isTypeLegal(FloatVT)) {
-          setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
-                             Custom);
-        }
+      // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the range
+      // of f32.
+      EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+      if (isTypeLegal(FloatVT)) {
+        setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
+                           Custom);
       }
     }
 
@@ -912,17 +908,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
 
         setOperationAction(IntegerVPOps, VT, Custom);
 
-        // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
-        // type that can represent the value exactly.
-        if (VT.getVectorElementType() != MVT::i64) {
-          MVT FloatEltVT =
-              VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
-          EVT FloatVT =
-              MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
-          if (isTypeLegal(FloatVT))
-            setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
-                               Custom);
-        }
+        // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
+        // range of f32.
+        EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+        if (isTypeLegal(FloatVT))
+          setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
+                             Custom);
       }
 
       for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
@@ -3535,15 +3526,20 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
 
 // Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting
 // the exponent.
-static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+SDValue
+RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op,
+                                               SelectionDAG &DAG) const {
   MVT VT = Op.getSimpleValueType();
   unsigned EltSize = VT.getScalarSizeInBits();
   SDValue Src = Op.getOperand(0);
   SDLoc DL(Op);
 
-  // We need a FP type that can represent the value.
+  // We choose FP type that can represent the value if possible. Otherwise, we
+  // use rounding to zero conversion for correct exponent of the result.
   // TODO: Use f16 for i8 when possible?
-  MVT FloatEltVT = EltSize == 32 ? MVT::f64 : MVT::f32;
+  MVT FloatEltVT = (EltSize >= 32) ? MVT::f64 : MVT::f32;
+  if (!isTypeLegal(MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount())))
+    FloatEltVT = MVT::f32;
   MVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
 
   // Legal types should have been checked in the RISCVTargetLowering
@@ -3560,27 +3556,50 @@ static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
   }
 
   // We have a legal FP type, convert to it.
-  SDValue FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src);
+  SDValue FloatVal;
+  if (FloatVT.bitsGT(VT)) {
+    FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src);
+  } else {
+    // Use RTZ to avoid rounding influencing exponent of FloatVal.
+    MVT ContainerVT = VT;
+    if (VT.isFixedLengthVector()) {
+      ContainerVT = getContainerForFixedLengthVector(VT);
+      Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget);
+    }
+
+    auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+    SDValue RTZRM =
+        DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, Subtarget.getXLenVT());
+    MVT ContainerFloatVT =
+        MVT::getVectorVT(FloatEltVT, ContainerVT.getVectorElementCount());
+    FloatVal = DAG.getNode(RISCVISD::VFCVT_RM_F_XU_VL, DL, ContainerFloatVT,
+                           Src, Mask, RTZRM, VL);
+    if (VT.isFixedLengthVector())
+      FloatVal = convertFromScalableVector(FloatVT, FloatVal, DAG, Subtarget);
+  }
   // Bitcast to integer and shift the exponent to the LSB.
   EVT IntVT = FloatVT.changeVectorElementTypeToInteger();
   SDValue Bitcast = DAG.getBitcast(IntVT, FloatVal);
   unsigned ShiftAmt = FloatEltVT == MVT::f64 ? 52 : 23;
-  SDValue Shift = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast,
-                              DAG.getConstant(ShiftAmt, DL, IntVT));
-  // Truncate back to original type to allow vnsrl.
-  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Shift);
+  SDValue Exp = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast,
+                            DAG.getConstant(ShiftAmt, DL, IntVT));
+  // Restore back to original type. Truncation after SRL is to generate vnsrl.
+  if (IntVT.bitsLT(VT))
+    Exp = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Exp);
+  else if (IntVT.bitsGT(VT))
+    Exp = DAG.getNode(ISD::TRUNCATE, DL, VT, Exp);
   // The exponent contains log2 of the value in biased form.
   unsigned ExponentBias = FloatEltVT == MVT::f64 ? 1023 : 127;
 
   // For trailing zeros, we just need to subtract the bias.
   if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF)
-    return DAG.getNode(ISD::SUB, DL, VT, Trunc,
+    return DAG.getNode(ISD::SUB, DL, VT, Exp,
                        DAG.getConstant(ExponentBias, DL, VT));
 
   // For leading zeros, we need to remove the bias and convert from log2 to
   // leading zeros. We can do this by subtracting from (Bias + (EltSize - 1)).
   unsigned Adjust = ExponentBias + (EltSize - 1);
-  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Trunc);
+  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Exp);
 }
 
 // While RVV has alignment restrictions, we should always be able to load as a
@@ -11571,6 +11590,28 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_X_F_V_MF2_MASK);
   case RISCV::PseudoVFCVT_RM_X_F_V_MF4_MASK:
     return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_X_F_V_MF4_MASK);
+  case RISCV::PseudoVFCVT_RM_F_XU_V_M1_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M1_MASK);
+  case RISCV::PseudoVFCVT_RM_F_XU_V_M2_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M2_MASK);
+  case RISCV::PseudoVFCVT_RM_F_XU_V_M4_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M4_MASK);
+  case RISCV::PseudoVFCVT_RM_F_XU_V_M8_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_M8_MASK);
+  case RISCV::PseudoVFCVT_RM_F_XU_V_MF2_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_MF2_MASK);
+  case RISCV::PseudoVFCVT_RM_F_XU_V_MF4_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFCVT_F_XU_V_MF4_MASK);
+  case RISCV::PseudoVFNCVT_RM_F_XU_W_M1_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_M1_MASK);
+  case RISCV::PseudoVFNCVT_RM_F_XU_W_M2_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_M2_MASK);
+  case RISCV::PseudoVFNCVT_RM_F_XU_W_M4_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_M4_MASK);
+  case RISCV::PseudoVFNCVT_RM_F_XU_W_MF2_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_MF2_MASK);
+  case RISCV::PseudoVFNCVT_RM_F_XU_W_MF4_MASK:
+    return emitVFCVT_RM_MASK(MI, BB, RISCV::PseudoVFNCVT_F_XU_W_MF4_MASK);
   case RISCV::PseudoVFROUND_NOEXCEPT_V_M1_MASK:
     return emitVFROUND_NOEXCEPT_MASK(MI, BB, RISCV::PseudoVFCVT_X_F_V_M1_MASK,
                                      RISCV::PseudoVFCVT_F_X_V_M1_MASK);
@@ -13167,6 +13208,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(VFROUND_NOEXCEPT_VL)
   NODE_NAME_CASE(SINT_TO_FP_VL)
   NODE_NAME_CASE(UINT_TO_FP_VL)
+  NODE_NAME_CASE(VFCVT_RM_F_XU_VL)
   NODE_NAME_CASE(FP_EXTEND_VL)
   NODE_NAME_CASE(FP_ROUND_VL)
   NODE_NAME_CASE(VWMUL_VL)

diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 883715a0ceec3..3de2e4dd02328 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -243,6 +243,7 @@ enum NodeType : unsigned {
   VFCVT_RM_X_F_VL, // Has a rounding mode operand.
   SINT_TO_FP_VL,
   UINT_TO_FP_VL,
+  VFCVT_RM_F_XU_VL, // Has a rounding mode operand.
   FP_ROUND_VL,
   FP_EXTEND_VL,
 
@@ -704,6 +705,7 @@ class RISCVTargetLowering : public TargetLowering {
   SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const;
   SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const;

diff  --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 291fdd93f0537..f2d22048babc9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -3406,6 +3406,17 @@ multiclass VPseudoVCVTF_V {
   }
 }
 
+multiclass VPseudoVCVTF_RM_V {
+  foreach m = MxListF in {
+    defvar mx = m.MX;
+    defvar WriteVFCvtIToFV_MX = !cast<SchedWrite>("WriteVFCvtIToFV_" # mx);
+    defvar ReadVFCvtIToFV_MX = !cast<SchedRead>("ReadVFCvtIToFV_" # mx);
+
+    defm _V : VPseudoConversionRM<m.vrclass, m.vrclass, m>,
+              Sched<[WriteVFCvtIToFV_MX, ReadVFCvtIToFV_MX, ReadVMask]>;
+  }
+}
+
 multiclass VPseudoConversionW_V {
   defvar constraint = "@earlyclobber $rd";
   foreach m = MxListW in
@@ -3472,6 +3483,18 @@ multiclass VPseudoVNCVTF_W {
   }
 }
 
+multiclass VPseudoVNCVTF_RM_W {
+  defvar constraint = "@earlyclobber $rd";
+  foreach m = MxListFW in {
+    defvar mx = m.MX;
+    defvar WriteVFNCvtIToFV_MX = !cast<SchedWrite>("WriteVFNCvtIToFV_" # mx);
+    defvar ReadVFNCvtIToFV_MX = !cast<SchedRead>("ReadVFNCvtIToFV_" # mx);
+
+    defm _W : VPseudoConversionRM<m.vrclass, m.wvrclass, m, constraint>,
+              Sched<[WriteVFNCvtIToFV_MX, ReadVFNCvtIToFV_MX, ReadVMask]>;
+  }
+}
+
 multiclass VPseudoVNCVTD_W {
   defvar constraint = "@earlyclobber $rd";
   foreach m = MxListFW in {
@@ -5495,6 +5518,7 @@ let Uses = [FRM] in {
 defm PseudoVFCVT_F_XU : VPseudoVCVTF_V;
 defm PseudoVFCVT_F_X : VPseudoVCVTF_V;
 }
+defm PseudoVFCVT_RM_F_XU : VPseudoVCVTF_RM_V;
 } // mayRaiseFPException = true
 
 //===----------------------------------------------------------------------===//
@@ -5528,6 +5552,7 @@ defm PseudoVFNCVT_F_X      : VPseudoVNCVTF_W;
 defm PseudoVFNCVT_F_F      : VPseudoVNCVTD_W;
 }
 defm PseudoVFNCVT_ROD_F_F  : VPseudoVNCVTD_W;
+defm PseudoVFNCVT_RM_F_XU  : VPseudoVNCVTF_RM_W;
 } // mayRaiseFPException = true
 } // Predicates = [HasVInstructionsAnyF]
 

diff  --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 09b94b29cef8c..bbb55f8ef257e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -140,11 +140,17 @@ def SDT_RISCVI2FPOp_VL  : SDTypeProfile<1, 3, [
   SDTCisFP<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1>,
   SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>
 ]>;
+def SDT_RISCVI2FPOp_RM_VL  : SDTypeProfile<1, 4, [
+  SDTCisFP<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1>,
+  SDTCVecEltisVT<2, i1>, SDTCisSameNumEltsAs<1, 2>, SDTCisVT<3, XLenVT>,
+  SDTCisVT<4, XLenVT>
+]>;
 
 def riscv_vfcvt_rtz_x_f_vl  : SDNode<"RISCVISD::VFCVT_RTZ_X_F_VL",  SDT_RISCVFP2IOp_VL>;
 def riscv_vfcvt_rtz_xu_f_vl : SDNode<"RISCVISD::VFCVT_RTZ_XU_F_VL", SDT_RISCVFP2IOp_VL>;
 def riscv_sint_to_fp_vl : SDNode<"RISCVISD::SINT_TO_FP_VL", SDT_RISCVI2FPOp_VL>;
 def riscv_uint_to_fp_vl : SDNode<"RISCVISD::UINT_TO_FP_VL", SDT_RISCVI2FPOp_VL>;
+def riscv_vfcvt_rm_f_xu_vl : SDNode<"RISCVISD::VFCVT_RM_F_XU_VL", SDT_RISCVI2FPOp_RM_VL>;
 
 def SDT_RISCVVecCvtF2XOp_VL : SDTypeProfile<1, 4, [
   SDTCisInt<0>, SDTCisFP<1>, SDTCisSameNumEltsAs<0, 1>,
@@ -796,6 +802,18 @@ multiclass VPatConvertI2FPVL_V<SDNode vop, string instruction_name> {
   }
 }
 
+multiclass VPatConvertI2FP_RM_VL_V<SDNode vop, string instruction_name> {
+  foreach fvti = AllFloatVectors in {
+    defvar ivti = GetIntVTypeInfo<fvti>.Vti;
+    def : Pat<(fvti.Vector (vop (ivti.Vector ivti.RegClass:$rs1),
+                                (ivti.Mask V0), (XLenVT timm:$frm),
+                                VLOpFrag)),
+              (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
+                  (fvti.Vector (IMPLICIT_DEF)), ivti.RegClass:$rs1,
+                  (ivti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
+  }
+}
+
 multiclass VPatWConvertFP2IVL_V<SDNode vop, string instruction_name> {
   foreach fvtiToFWti = AllWidenableFloatVectors in {
     defvar fvti = fvtiToFWti.Vti;
@@ -848,6 +866,19 @@ multiclass VPatNConvertI2FPVL_V<SDNode vop, string instruction_name> {
   }
 }
 
+multiclass VPatNConvertI2FP_RM_VL_V<SDNode vop, string instruction_name> {
+  foreach fvtiToFWti = AllWidenableFloatVectors in {
+    defvar fvti = fvtiToFWti.Vti;
+    defvar iwti = GetIntVTypeInfo<fvtiToFWti.Wti>.Vti;
+    def : Pat<(fvti.Vector (vop (iwti.Vector iwti.RegClass:$rs1),
+                                (iwti.Mask V0),  (XLenVT timm:$frm),
+                                VLOpFrag)),
+              (!cast<Instruction>(instruction_name#"_"#fvti.LMul.MX#"_MASK")
+                  (fvti.Vector (IMPLICIT_DEF)), iwti.RegClass:$rs1,
+                  (iwti.Mask V0), timm:$frm, GPR:$vl, fvti.Log2SEW, TA_MA)>;
+  }
+}
+
 multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
   foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
     defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
@@ -1713,6 +1744,7 @@ foreach fvti = AllFloatVectors in {
   defm : VPatConvertFP2IVL_V<riscv_vfcvt_rtz_xu_f_vl, "PseudoVFCVT_RTZ_XU_F_V">;
   defm : VPatConvertI2FPVL_V<riscv_sint_to_fp_vl, "PseudoVFCVT_F_X_V">;
   defm : VPatConvertI2FPVL_V<riscv_uint_to_fp_vl, "PseudoVFCVT_F_XU_V">;
+  defm : VPatConvertI2FP_RM_VL_V<riscv_vfcvt_rm_f_xu_vl, "PseudoVFCVT_RM_F_XU_V">;
 
   // 13.18. Widening Floating-Point/Integer Type-Convert Instructions
   defm : VPatWConvertFP2IVL_V<riscv_vfcvt_rtz_x_f_vl, "PseudoVFWCVT_RTZ_X_F_V">;
@@ -1735,6 +1767,8 @@ foreach fvti = AllFloatVectors in {
   defm : VPatNConvertFP2IVL_V<riscv_vfcvt_rtz_xu_f_vl, "PseudoVFNCVT_RTZ_XU_F_W">;
   defm : VPatNConvertI2FPVL_V<riscv_sint_to_fp_vl, "PseudoVFNCVT_F_X_W">;
   defm : VPatNConvertI2FPVL_V<riscv_uint_to_fp_vl, "PseudoVFNCVT_F_XU_W">;
+  defm :
+    VPatNConvertI2FP_RM_VL_V<riscv_vfcvt_rm_f_xu_vl, "PseudoVFNCVT_RM_F_XU_W">;
   foreach fvtiToFWti = AllWidenableFloatVectors in {
     defvar fvti = fvtiToFWti.Vti;
     defvar fwti = fvtiToFWti.Wti;

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index 27250b4274956..634d0850cc45a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV32,RV32I
 ; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV64,RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV64
 ; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV64
 
@@ -29,6 +31,20 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv1i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vrsub.vx v8, v9, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv1i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
@@ -72,6 +88,20 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv2i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vrsub.vx v8, v9, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv2i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -115,6 +145,20 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv4i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vrsub.vx v8, v9, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv4i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
@@ -158,6 +202,20 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv8i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT:    vnsrl.wi v10, v12, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vrsub.vx v8, v9, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv8i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
@@ -201,6 +259,20 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv16i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT:    vnsrl.wi v12, v16, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v10, v12, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vrsub.vx v8, v10, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv16i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -344,6 +416,18 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv1i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv1i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
@@ -427,6 +511,18 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv2i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv2i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -510,6 +606,18 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv4i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv4i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
@@ -593,6 +701,18 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv8i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-F-NEXT:    vnsrl.wi v10, v12, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v10, v10, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v10, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv8i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
@@ -676,6 +796,18 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv16i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-F-NEXT:    vnsrl.wi v12, v16, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v12, v12, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv16i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -836,6 +968,21 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv1i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v9, v9, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv1i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
@@ -929,6 +1076,21 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv2i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v9, v9, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv2i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
@@ -1022,6 +1184,21 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv4i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v10, v10, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv4i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
@@ -1115,6 +1292,21 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_nxv8i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v12, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_nxv8i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
@@ -1136,481 +1328,643 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 declare <vscale x 8 x i32> @llvm.ctlz.nxv8i32(<vscale x 8 x i32>, i1)
 
 define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: ctlz_nxv16i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vand.vx v16, v8, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vmul.vx v8, v8, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    ret
+; RV32I-LABEL: ctlz_nxv16i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vand.vx v16, v16, a0
+; RV32I-NEXT:    vsub.vv v8, v8, v16
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vand.vx v16, v8, a0
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vx v8, v8, a0
+; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v16
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vand.vx v8, v8, a0
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vmul.vx v8, v8, a0
+; RV32I-NEXT:    vsrl.vi v8, v8, 24
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: ctlz_nxv16i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    vmul.vx v8, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 24
-; RV64-NEXT:    ret
+; RV64I-LABEL: ctlz_nxv16i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, 209715
+; RV64I-NEXT:    addiw a0, a0, 819
+; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vadd.vv v8, v16, v8
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    lui a0, 4112
+; RV64I-NEXT:    addiw a0, a0, 257
+; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v8, v8, 24
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_nxv16i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v16, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v16, v16, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_nxv16i32:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v8, v0.t
+; CHECK-D-NEXT:    vsrl.vi v16, v16, 23
+; CHECK-D-NEXT:    li a1, 158
+; CHECK-D-NEXT:    vrsub.vx v16, v16, a1
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    li a1, 32
+; CHECK-D-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 false)
   ret <vscale x 16 x i32> %a
 }
 declare <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32>, i1)
 
 define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: ctlz_nxv1i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v9, (a0), zero
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 1
-; RV32-NEXT:    vand.vv v9, v11, v9
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    vand.vv v9, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v9, v8
-; RV32-NEXT:    vlse64.v v9, (a0), zero
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v11
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: ctlz_nxv1i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v9, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v9, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v9, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsrl.vx v9, v8, a0
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v9, (a0), zero
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vsrl.vi v11, v8, 1
+; RV32I-NEXT:    vand.vv v9, v11, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vlse64.v v9, (a0), zero
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vsrl.vi v11, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v11
+; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmul.vv v8, v8, v10
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: ctlz_nxv1i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI18_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI18_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI18_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI18_1)(a1)
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI18_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI18_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI18_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI18_3)(a1)
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: ctlz_nxv1i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vsrl.vi v9, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vsrl.vi v9, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vsrl.vi v9, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vsrl.vi v9, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    li a0, 32
+; RV64I-NEXT:    vsrl.vx v9, v8, a0
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI18_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI18_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI18_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI18_1)(a1)
+; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v9
+; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v9, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI18_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI18_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI18_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI18_3)(a1)
+; RV64I-NEXT:    vsrl.vi v9, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v9
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_nxv1i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v9, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v9
+; CHECK-F-NEXT:    li a1, 190
+; CHECK-F-NEXT:    vrsub.vx v9, v10, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_nxv1i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v9, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v9, v9, a1
+; CHECK-D-NEXT:    li a1, 1086
+; CHECK-D-NEXT:    vrsub.vx v9, v9, a1
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    li a1, 64
+; CHECK-D-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 false)
   ret <vscale x 1 x i64> %a
 }
 declare <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64>, i1)
 
 define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: ctlz_nxv2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v10, v8, a0
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 1
-; RV32-NEXT:    vand.vv v10, v14, v10
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v14
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vmul.vv v8, v8, v12
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: ctlz_nxv2i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsrl.vx v10, v8, a0
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vsrl.vi v14, v8, 1
+; RV32I-NEXT:    vand.vv v10, v14, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vsrl.vi v14, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v14
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmul.vv v8, v8, v12
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: ctlz_nxv2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI19_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI19_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI19_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI19_1)(a1)
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI19_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI19_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI19_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI19_3)(a1)
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: ctlz_nxv2i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vsrl.vi v10, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vsrl.vi v10, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vsrl.vi v10, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    li a0, 32
+; RV64I-NEXT:    vsrl.vx v10, v8, a0
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI19_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI19_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI19_1)(a1)
+; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v10
+; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v10, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI19_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI19_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI19_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI19_3)(a1)
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v10
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_nxv2i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v12, v10
+; CHECK-F-NEXT:    li a1, 190
+; CHECK-F-NEXT:    vrsub.vx v10, v12, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_nxv2i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v10, v10, a1
+; CHECK-D-NEXT:    li a1, 1086
+; CHECK-D-NEXT:    vrsub.vx v10, v10, a1
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    li a1, 64
+; CHECK-D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 false)
   ret <vscale x 2 x i64> %a
 }
 declare <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64>, i1)
 
 define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
-; RV32-LABEL: ctlz_nxv4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v12, v8, a0
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 1
-; RV32-NEXT:    vand.vv v12, v20, v12
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v12, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v20
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: ctlz_nxv4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI20_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI20_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI20_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI20_1)(a1)
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI20_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI20_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI20_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI20_3)(a1)
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
-  %a = call <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 false)
-  ret <vscale x 4 x i64> %a
-}
-declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
-
-define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
-; RV32-LABEL: ctlz_nxv8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a0
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: ctlz_nxv4i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsrl.vx v12, v8, a0
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vsrl.vi v20, v8, 1
+; RV32I-NEXT:    vand.vv v12, v20, v12
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vsrl.vi v20, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v20
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: ctlz_nxv8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI21_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI21_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI21_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI21_1)(a1)
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI21_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI21_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI21_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI21_3)(a1)
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: ctlz_nxv4i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    li a0, 32
+; RV64I-NEXT:    vsrl.vx v12, v8, a0
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI20_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI20_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI20_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI20_1)(a1)
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v12
+; RV64I-NEXT:    vand.vx v12, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v12, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI20_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI20_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI20_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI20_3)(a1)
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v12
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_nxv4i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v16, v12
+; CHECK-F-NEXT:    li a1, 190
+; CHECK-F-NEXT:    vrsub.vx v12, v16, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_nxv4i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v12, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v12, v12, a1
+; CHECK-D-NEXT:    li a1, 1086
+; CHECK-D-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    li a1, 64
+; CHECK-D-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
+  %a = call <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 false)
+  ret <vscale x 4 x i64> %a
+}
+declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
+
+define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
+; RV32I-LABEL: ctlz_nxv8i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsrl.vx v16, v8, a0
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vlse64.v v24, (a0), zero
+; RV32I-NEXT:    vsrl.vi v0, v8, 1
+; RV32I-NEXT:    vand.vv v16, v0, v16
+; RV32I-NEXT:    vsub.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v16, v8, v24
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v24
+; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vlse64.v v24, (a0), zero
+; RV32I-NEXT:    vsrl.vi v0, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v0
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: ctlz_nxv8i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    li a0, 32
+; RV64I-NEXT:    vsrl.vx v16, v8, a0
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI21_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI21_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI21_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI21_1)(a1)
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v16
+; RV64I-NEXT:    vand.vx v16, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v16, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI21_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI21_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI21_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI21_3)(a1)
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_nxv8i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v24, v16
+; CHECK-F-NEXT:    li a1, 190
+; CHECK-F-NEXT:    vrsub.vx v16, v24, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_nxv8i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v16, v16, a1
+; CHECK-D-NEXT:    li a1, 1086
+; CHECK-D-NEXT:    vrsub.vx v16, v16, a1
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    li a1, 64
+; CHECK-D-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 8 x i64> @llvm.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 false)
   ret <vscale x 8 x i64> %a
 }
@@ -1641,6 +1995,18 @@ define <vscale x 1 x i8> @ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv1i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
@@ -1681,6 +2047,18 @@ define <vscale x 2 x i8> @ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv2i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -1721,6 +2099,18 @@ define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv4i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
@@ -1761,6 +2151,18 @@ define <vscale x 8 x i8> @ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv8i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vrsub.vx v8, v10, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
@@ -1801,6 +2203,18 @@ define <vscale x 16 x i8> @ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv16i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-F-NEXT:    li a0, 134
+; CHECK-F-NEXT:    vrsub.vx v8, v12, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv16i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -1939,6 +2353,15 @@ define <vscale x 1 x i16> @ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv1i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
@@ -2018,6 +2441,15 @@ define <vscale x 2 x i16> @ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv2i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -2097,6 +2529,15 @@ define <vscale x 4 x i16> @ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv4i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
@@ -2176,6 +2617,15 @@ define <vscale x 8 x i16> @ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv8i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
@@ -2255,6 +2705,15 @@ define <vscale x 16 x i16> @ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv16i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv16i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -2410,6 +2869,18 @@ define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv1i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv1i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
@@ -2499,6 +2970,18 @@ define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv2i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv2i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
@@ -2588,6 +3071,18 @@ define <vscale x 4 x i32> @ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv4i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv4i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
@@ -2677,6 +3172,18 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv8i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: ctlz_zero_undef_nxv8i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
@@ -2694,477 +3201,609 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 }
 
 define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv16i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vand.vx v16, v8, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vmul.vx v8, v8, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    ret
+; RV32I-LABEL: ctlz_zero_undef_nxv16i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vand.vx v16, v16, a0
+; RV32I-NEXT:    vsub.vv v8, v8, v16
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vand.vx v16, v8, a0
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vx v8, v8, a0
+; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v16
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vand.vx v8, v8, a0
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vmul.vx v8, v8, a0
+; RV32I-NEXT:    vsrl.vi v8, v8, 24
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: ctlz_zero_undef_nxv16i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    vmul.vx v8, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 24
-; RV64-NEXT:    ret
+; RV64I-LABEL: ctlz_zero_undef_nxv16i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, 209715
+; RV64I-NEXT:    addiw a0, a0, 819
+; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vadd.vv v8, v16, v8
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    lui a0, 4112
+; RV64I-NEXT:    addiw a0, a0, 257
+; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v8, v8, 24
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv16i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv16i32:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-D-NEXT:    li a1, 158
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 16 x i32> @llvm.ctlz.nxv16i32(<vscale x 16 x i32> %va, i1 true)
   ret <vscale x 16 x i32> %a
 }
 
 define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv1i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v9, (a0), zero
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 1
-; RV32-NEXT:    vand.vv v9, v11, v9
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    vand.vv v9, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v9, v8
-; RV32-NEXT:    vlse64.v v9, (a0), zero
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v11
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: ctlz_zero_undef_nxv1i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v9, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v9, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v9, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsrl.vx v9, v8, a0
+; RV32I-NEXT:    vor.vv v8, v8, v9
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v9, (a0), zero
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vsrl.vi v11, v8, 1
+; RV32I-NEXT:    vand.vv v9, v11, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vlse64.v v9, (a0), zero
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vsrl.vi v11, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v11
+; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmul.vv v8, v8, v10
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: ctlz_zero_undef_nxv1i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vsrl.vi v9, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI40_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI40_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI40_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI40_1)(a1)
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI40_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI40_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI40_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI40_3)(a1)
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: ctlz_zero_undef_nxv1i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vsrl.vi v9, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vsrl.vi v9, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vsrl.vi v9, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vsrl.vi v9, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    li a0, 32
+; RV64I-NEXT:    vsrl.vx v9, v8, a0
+; RV64I-NEXT:    vor.vv v8, v8, v9
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI40_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI40_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI40_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI40_1)(a1)
+; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v9
+; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v9, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI40_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI40_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI40_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI40_3)(a1)
+; RV64I-NEXT:    vsrl.vi v9, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v9
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv1i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v9, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v9, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    li a1, 190
+; CHECK-F-NEXT:    vrsub.vx v8, v9, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv1i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-D-NEXT:    li a1, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 1 x i64> @llvm.ctlz.nxv1i64(<vscale x 1 x i64> %va, i1 true)
   ret <vscale x 1 x i64> %a
 }
 
 define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v10, v8, a0
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 1
-; RV32-NEXT:    vand.vv v10, v14, v10
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v14
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vmul.vv v8, v8, v12
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: ctlz_zero_undef_nxv2i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsrl.vx v10, v8, a0
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vsrl.vi v14, v8, 1
+; RV32I-NEXT:    vand.vv v10, v14, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vsrl.vi v14, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v14
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmul.vv v8, v8, v12
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: ctlz_zero_undef_nxv2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI41_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI41_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI41_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI41_1)(a1)
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI41_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI41_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI41_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI41_3)(a1)
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: ctlz_zero_undef_nxv2i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vsrl.vi v10, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vsrl.vi v10, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vsrl.vi v10, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    li a0, 32
+; RV64I-NEXT:    vsrl.vx v10, v8, a0
+; RV64I-NEXT:    vor.vv v8, v8, v10
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI41_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI41_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI41_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI41_1)(a1)
+; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v10
+; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v10, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI41_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI41_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI41_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI41_3)(a1)
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v10
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv2i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v10, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    li a1, 190
+; CHECK-F-NEXT:    vrsub.vx v8, v10, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv2i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-D-NEXT:    li a1, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.ctlz.nxv2i64(<vscale x 2 x i64> %va, i1 true)
   ret <vscale x 2 x i64> %a
 }
 
 define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v12, v8, a0
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 1
-; RV32-NEXT:    vand.vv v12, v20, v12
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v12, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v20
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: ctlz_zero_undef_nxv4i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsrl.vx v12, v8, a0
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vsrl.vi v20, v8, 1
+; RV32I-NEXT:    vand.vv v12, v20, v12
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vsrl.vi v20, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v20
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: ctlz_zero_undef_nxv4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vsrl.vi v12, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI42_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI42_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI42_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI42_1)(a1)
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI42_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI42_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI42_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI42_3)(a1)
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: ctlz_zero_undef_nxv4i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vsrl.vi v12, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    li a0, 32
+; RV64I-NEXT:    vsrl.vx v12, v8, a0
+; RV64I-NEXT:    vor.vv v8, v8, v12
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI42_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI42_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI42_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI42_1)(a1)
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v12
+; RV64I-NEXT:    vand.vx v12, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v12, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI42_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI42_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI42_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI42_3)(a1)
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v12
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv4i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v12, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    li a1, 190
+; CHECK-F-NEXT:    vrsub.vx v8, v12, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv4i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-D-NEXT:    li a1, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64> %va, i1 true)
   ret <vscale x 4 x i64> %a
 }
 
 define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
-; RV32-LABEL: ctlz_zero_undef_nxv8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a0
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: ctlz_zero_undef_nxv8i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsrl.vx v16, v8, a0
+; RV32I-NEXT:    vor.vv v8, v8, v16
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vlse64.v v24, (a0), zero
+; RV32I-NEXT:    vsrl.vi v0, v8, 1
+; RV32I-NEXT:    vand.vv v16, v0, v16
+; RV32I-NEXT:    vsub.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v16, v8, v24
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v24
+; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vlse64.v v24, (a0), zero
+; RV32I-NEXT:    vsrl.vi v0, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v0
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: ctlz_zero_undef_nxv8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI43_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI43_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI43_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI43_1)(a1)
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI43_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI43_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI43_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI43_3)(a1)
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: ctlz_zero_undef_nxv8i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 2
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 8
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 16
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    li a0, 32
+; RV64I-NEXT:    vsrl.vx v16, v8, a0
+; RV64I-NEXT:    vor.vv v8, v8, v16
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI43_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI43_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI43_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI43_1)(a1)
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v16
+; RV64I-NEXT:    vand.vx v16, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v16, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI43_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI43_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI43_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI43_3)(a1)
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: ctlz_zero_undef_nxv8i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v16, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v16, v8
+; CHECK-F-NEXT:    li a1, 190
+; CHECK-F-NEXT:    vrsub.vx v8, v16, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: ctlz_zero_undef_nxv8i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-D-NEXT:    li a1, 1086
+; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 8 x i64> @llvm.ctlz.nxv8i64(<vscale x 8 x i64> %va, i1 true)
   ret <vscale x 8 x i64> %a
 }

diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index fd02061ffbc15..439e63aa68243 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV32,RV32I
 ; RUN: llc -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ZVE64X,RV64,RV64I
-; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV32,RV32F
+; RUN: llc -mtriple=riscv64 -mattr=+zve64f,+f -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-F,RV64,RV64F
+; RUN: llc -mtriple=riscv32 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV32,RV32D
+; RUN: llc -mtriple=riscv64 -mattr=+v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-D,RV64,RV64D
 
 define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X-LABEL: cttz_nxv1i8:
@@ -26,6 +28,23 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv1i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsub.vx v8, v9, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv1i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
@@ -69,6 +88,23 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv2i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v10
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsub.vx v8, v9, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv2i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
@@ -112,6 +148,23 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv4i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT:    vnsrl.wi v9, v12, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsub.vx v8, v9, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv4i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
@@ -155,6 +208,23 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv8i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT:    vnsrl.wi v10, v12, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsub.vx v8, v9, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv8i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
@@ -198,6 +268,23 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv16i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-F-NEXT:    vand.vv v10, v8, v10
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v12, v10
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT:    vnsrl.wi v12, v16, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v10, v12, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsub.vx v8, v10, a0
+; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv16i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
@@ -328,6 +415,20 @@ define <vscale x 1 x i16> @cttz_nxv1i16(<vscale x 1 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv1i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv1i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
@@ -403,6 +504,20 @@ define <vscale x 2 x i16> @cttz_nxv2i16(<vscale x 2 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv2i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv2i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -478,6 +593,20 @@ define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv4i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv4i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
@@ -553,6 +682,20 @@ define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv8i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-F-NEXT:    vand.vv v10, v8, v10
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT:    vnsrl.wi v10, v12, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v10, v10, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v10, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv8i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
@@ -628,6 +771,20 @@ define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv16i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-F-NEXT:    vand.vv v12, v8, v12
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT:    vnsrl.wi v12, v16, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v12, v12, a0
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a0, 16
+; CHECK-F-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv16i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -766,6 +923,23 @@ define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv1i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v9, v0.t
+; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v9, v9, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv1i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
@@ -847,6 +1021,23 @@ define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv2i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v9, v0.t
+; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v9, v9, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv2i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
@@ -928,6 +1119,23 @@ define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv4i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-F-NEXT:    vand.vv v10, v8, v10
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v10, v10, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv4i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
@@ -1009,6 +1217,23 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_nxv8i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-F-NEXT:    vand.vv v12, v8, v12
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v12, v12, v0.t
+; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v12, v12, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_nxv8i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
@@ -1032,387 +1257,733 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
 declare <vscale x 8 x i32> @llvm.cttz.nxv8i32(<vscale x 8 x i32>, i1)
 
 define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: cttz_nxv16i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vand.vx v16, v8, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vmul.vx v8, v8, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_nxv16i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32I-NEXT:    vsub.vx v16, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vand.vx v16, v16, a0
+; RV32I-NEXT:    vsub.vv v8, v8, v16
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vand.vx v16, v8, a0
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vx v8, v8, a0
+; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v16
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vand.vx v8, v8, a0
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vmul.vx v8, v8, a0
+; RV32I-NEXT:    vsrl.vi v8, v8, 24
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_nxv16i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    vmul.vx v8, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 24
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_nxv16i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, 209715
+; RV64I-NEXT:    addiw a0, a0, 819
+; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vadd.vv v8, v16, v8
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    lui a0, 4112
+; RV64I-NEXT:    addiw a0, a0, 257
+; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v8, v8, 24
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: cttz_nxv16i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-F-NEXT:    vand.vv v16, v8, v16
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
+; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v16, v16, a1
+; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    li a1, 32
+; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: cttz_nxv16i32:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-D-NEXT:    vand.vv v16, v8, v16
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
+; CHECK-D-NEXT:    vsrl.vi v16, v16, 23
+; CHECK-D-NEXT:    li a1, 127
+; CHECK-D-NEXT:    vsub.vx v16, v16, a1
+; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    li a1, 32
+; CHECK-D-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 false)
   ret <vscale x 16 x i32> %a
 }
 declare <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32>, i1)
 
 define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: cttz_nxv1i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vlse64.v v9, (a0), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 1
-; RV32-NEXT:    vand.vv v10, v11, v10
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vlse64.v v9, (a0), zero
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v11
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_nxv1i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vx v9, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vlse64.v v9, (a0), zero
+; RV32I-NEXT:    vsrl.vi v11, v8, 1
+; RV32I-NEXT:    vand.vv v10, v11, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v9
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vlse64.v v9, (a0), zero
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vsrl.vi v11, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v11
+; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmul.vv v8, v8, v10
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_nxv1i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsub.vx v9, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v9
-; RV64-NEXT:    lui a0, %hi(.LCPI18_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI18_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI18_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI18_1)(a1)
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI18_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI18_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI18_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI18_3)(a1)
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_nxv1i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vsub.vx v9, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v9
+; RV64I-NEXT:    lui a0, %hi(.LCPI18_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI18_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI18_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI18_1)(a1)
+; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v9
+; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v9, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI18_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI18_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI18_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI18_3)(a1)
+; RV64I-NEXT:    vsrl.vi v9, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v9
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; RV32F-LABEL: cttz_nxv1i64:
+; RV32F:       # %bb.0:
+; RV32F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32F-NEXT:    vmseq.vx v9, v8, zero
+; RV32F-NEXT:    vrsub.vi v10, v8, 0
+; RV32F-NEXT:    vand.vv v8, v8, v10
+; RV32F-NEXT:    vmset.m v0
+; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV32F-NEXT:    vfncvt.f.xu.w v10, v8, v0.t
+; RV32F-NEXT:    vsrl.vi v8, v10, 23
+; RV32F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32F-NEXT:    vzext.vf2 v10, v8
+; RV32F-NEXT:    li a1, 127
+; RV32F-NEXT:    vsub.vx v8, v10, a1
+; RV32F-NEXT:    li a1, 64
+; RV32F-NEXT:    vmv.v.v v0, v9
+; RV32F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; RV32F-NEXT:    fsrm a0
+; RV32F-NEXT:    ret
+;
+; RV64F-LABEL: cttz_nxv1i64:
+; RV64F:       # %bb.0:
+; RV64F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64F-NEXT:    vrsub.vi v9, v8, 0
+; RV64F-NEXT:    vand.vv v9, v8, v9
+; RV64F-NEXT:    vmset.m v0
+; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RV64F-NEXT:    vfncvt.f.xu.w v10, v9, v0.t
+; RV64F-NEXT:    vsrl.vi v9, v10, 23
+; RV64F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64F-NEXT:    vzext.vf2 v10, v9
+; RV64F-NEXT:    li a1, 127
+; RV64F-NEXT:    vsub.vx v9, v10, a1
+; RV64F-NEXT:    vmseq.vi v0, v8, 0
+; RV64F-NEXT:    li a1, 64
+; RV64F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RV64F-NEXT:    fsrm a0
+; RV64F-NEXT:    ret
+;
+; RV32D-LABEL: cttz_nxv1i64:
+; RV32D:       # %bb.0:
+; RV32D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32D-NEXT:    vmseq.vx v9, v8, zero
+; RV32D-NEXT:    vrsub.vi v10, v8, 0
+; RV32D-NEXT:    vand.vv v8, v8, v10
+; RV32D-NEXT:    vmset.m v0
+; RV32D-NEXT:    fsrmi a0, 1
+; RV32D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; RV32D-NEXT:    li a1, 52
+; RV32D-NEXT:    vsrl.vx v8, v8, a1
+; RV32D-NEXT:    li a1, 1023
+; RV32D-NEXT:    vsub.vx v8, v8, a1
+; RV32D-NEXT:    li a1, 64
+; RV32D-NEXT:    vmv.v.v v0, v9
+; RV32D-NEXT:    vmerge.vxm v8, v8, a1, v0
+; RV32D-NEXT:    fsrm a0
+; RV32D-NEXT:    ret
+;
+; RV64D-LABEL: cttz_nxv1i64:
+; RV64D:       # %bb.0:
+; RV64D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64D-NEXT:    vrsub.vi v9, v8, 0
+; RV64D-NEXT:    vand.vv v9, v8, v9
+; RV64D-NEXT:    vmset.m v0
+; RV64D-NEXT:    fsrmi a0, 1
+; RV64D-NEXT:    vfcvt.f.xu.v v9, v9, v0.t
+; RV64D-NEXT:    li a1, 52
+; RV64D-NEXT:    vsrl.vx v9, v9, a1
+; RV64D-NEXT:    li a1, 1023
+; RV64D-NEXT:    vsub.vx v9, v9, a1
+; RV64D-NEXT:    vmseq.vi v0, v8, 0
+; RV64D-NEXT:    li a1, 64
+; RV64D-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RV64D-NEXT:    fsrm a0
+; RV64D-NEXT:    ret
   %a = call <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64> %va, i1 false)
   ret <vscale x 1 x i64> %a
 }
 declare <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64>, i1)
 
 define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: cttz_nxv2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 1
-; RV32-NEXT:    vand.vv v12, v14, v12
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v14
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vmul.vv v8, v8, v12
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_nxv2i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vx v10, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vsrl.vi v14, v8, 1
+; RV32I-NEXT:    vand.vv v12, v14, v12
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vsrl.vi v14, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v14
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmul.vv v8, v8, v12
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_nxv2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsub.vx v10, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v10
-; RV64-NEXT:    lui a0, %hi(.LCPI19_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI19_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI19_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI19_1)(a1)
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI19_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI19_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI19_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI19_3)(a1)
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_nxv2i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vsub.vx v10, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v10
+; RV64I-NEXT:    lui a0, %hi(.LCPI19_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI19_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI19_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI19_1)(a1)
+; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v10
+; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v10, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI19_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI19_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI19_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI19_3)(a1)
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v10
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; RV32F-LABEL: cttz_nxv2i64:
+; RV32F:       # %bb.0:
+; RV32F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32F-NEXT:    vmseq.vx v10, v8, zero
+; RV32F-NEXT:    vrsub.vi v12, v8, 0
+; RV32F-NEXT:    vand.vv v8, v8, v12
+; RV32F-NEXT:    vmset.m v0
+; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32F-NEXT:    vfncvt.f.xu.w v11, v8, v0.t
+; RV32F-NEXT:    vsrl.vi v8, v11, 23
+; RV32F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32F-NEXT:    vzext.vf2 v12, v8
+; RV32F-NEXT:    li a1, 127
+; RV32F-NEXT:    vsub.vx v8, v12, a1
+; RV32F-NEXT:    li a1, 64
+; RV32F-NEXT:    vmv1r.v v0, v10
+; RV32F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; RV32F-NEXT:    fsrm a0
+; RV32F-NEXT:    ret
+;
+; RV64F-LABEL: cttz_nxv2i64:
+; RV64F:       # %bb.0:
+; RV64F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64F-NEXT:    vrsub.vi v10, v8, 0
+; RV64F-NEXT:    vand.vv v10, v8, v10
+; RV64F-NEXT:    vmset.m v0
+; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64F-NEXT:    vfncvt.f.xu.w v12, v10, v0.t
+; RV64F-NEXT:    vsrl.vi v10, v12, 23
+; RV64F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64F-NEXT:    vzext.vf2 v12, v10
+; RV64F-NEXT:    li a1, 127
+; RV64F-NEXT:    vsub.vx v10, v12, a1
+; RV64F-NEXT:    vmseq.vi v0, v8, 0
+; RV64F-NEXT:    li a1, 64
+; RV64F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RV64F-NEXT:    fsrm a0
+; RV64F-NEXT:    ret
+;
+; RV32D-LABEL: cttz_nxv2i64:
+; RV32D:       # %bb.0:
+; RV32D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32D-NEXT:    vmseq.vx v10, v8, zero
+; RV32D-NEXT:    vrsub.vi v12, v8, 0
+; RV32D-NEXT:    vand.vv v8, v8, v12
+; RV32D-NEXT:    vmset.m v0
+; RV32D-NEXT:    fsrmi a0, 1
+; RV32D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; RV32D-NEXT:    li a1, 52
+; RV32D-NEXT:    vsrl.vx v8, v8, a1
+; RV32D-NEXT:    li a1, 1023
+; RV32D-NEXT:    vsub.vx v8, v8, a1
+; RV32D-NEXT:    li a1, 64
+; RV32D-NEXT:    vmv1r.v v0, v10
+; RV32D-NEXT:    vmerge.vxm v8, v8, a1, v0
+; RV32D-NEXT:    fsrm a0
+; RV32D-NEXT:    ret
+;
+; RV64D-LABEL: cttz_nxv2i64:
+; RV64D:       # %bb.0:
+; RV64D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV64D-NEXT:    vrsub.vi v10, v8, 0
+; RV64D-NEXT:    vand.vv v10, v8, v10
+; RV64D-NEXT:    vmset.m v0
+; RV64D-NEXT:    fsrmi a0, 1
+; RV64D-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; RV64D-NEXT:    li a1, 52
+; RV64D-NEXT:    vsrl.vx v10, v10, a1
+; RV64D-NEXT:    li a1, 1023
+; RV64D-NEXT:    vsub.vx v10, v10, a1
+; RV64D-NEXT:    vmseq.vi v0, v8, 0
+; RV64D-NEXT:    li a1, 64
+; RV64D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RV64D-NEXT:    fsrm a0
+; RV64D-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64> %va, i1 false)
   ret <vscale x 2 x i64> %a
 }
 declare <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64>, i1)
 
 define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
-; RV32-LABEL: cttz_nxv4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 1
-; RV32-NEXT:    vand.vv v16, v20, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v20
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_nxv4i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vx v12, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vsrl.vi v20, v8, 1
+; RV32I-NEXT:    vand.vv v16, v20, v16
+; RV32I-NEXT:    vsub.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v16, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vsrl.vi v20, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v20
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_nxv4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsub.vx v12, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v12
-; RV64-NEXT:    lui a0, %hi(.LCPI20_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI20_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI20_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI20_1)(a1)
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI20_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI20_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI20_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI20_3)(a1)
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_nxv4i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT:    vsub.vx v12, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    lui a0, %hi(.LCPI20_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI20_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI20_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI20_1)(a1)
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v12
+; RV64I-NEXT:    vand.vx v12, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v12, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI20_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI20_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI20_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI20_3)(a1)
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v12
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; RV32F-LABEL: cttz_nxv4i64:
+; RV32F:       # %bb.0:
+; RV32F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32F-NEXT:    vmseq.vx v12, v8, zero
+; RV32F-NEXT:    vrsub.vi v16, v8, 0
+; RV32F-NEXT:    vand.vv v8, v8, v16
+; RV32F-NEXT:    vmset.m v0
+; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32F-NEXT:    vfncvt.f.xu.w v14, v8, v0.t
+; RV32F-NEXT:    vsrl.vi v8, v14, 23
+; RV32F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32F-NEXT:    vzext.vf2 v16, v8
+; RV32F-NEXT:    li a1, 127
+; RV32F-NEXT:    vsub.vx v8, v16, a1
+; RV32F-NEXT:    li a1, 64
+; RV32F-NEXT:    vmv1r.v v0, v12
+; RV32F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; RV32F-NEXT:    fsrm a0
+; RV32F-NEXT:    ret
+;
+; RV64F-LABEL: cttz_nxv4i64:
+; RV64F:       # %bb.0:
+; RV64F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64F-NEXT:    vrsub.vi v12, v8, 0
+; RV64F-NEXT:    vand.vv v12, v8, v12
+; RV64F-NEXT:    vmset.m v0
+; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64F-NEXT:    vfncvt.f.xu.w v16, v12, v0.t
+; RV64F-NEXT:    vsrl.vi v12, v16, 23
+; RV64F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV64F-NEXT:    vzext.vf2 v16, v12
+; RV64F-NEXT:    li a1, 127
+; RV64F-NEXT:    vsub.vx v12, v16, a1
+; RV64F-NEXT:    vmseq.vi v0, v8, 0
+; RV64F-NEXT:    li a1, 64
+; RV64F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; RV64F-NEXT:    fsrm a0
+; RV64F-NEXT:    ret
+;
+; RV32D-LABEL: cttz_nxv4i64:
+; RV32D:       # %bb.0:
+; RV32D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32D-NEXT:    vmseq.vx v12, v8, zero
+; RV32D-NEXT:    vrsub.vi v16, v8, 0
+; RV32D-NEXT:    vand.vv v8, v8, v16
+; RV32D-NEXT:    vmset.m v0
+; RV32D-NEXT:    fsrmi a0, 1
+; RV32D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; RV32D-NEXT:    li a1, 52
+; RV32D-NEXT:    vsrl.vx v8, v8, a1
+; RV32D-NEXT:    li a1, 1023
+; RV32D-NEXT:    vsub.vx v8, v8, a1
+; RV32D-NEXT:    li a1, 64
+; RV32D-NEXT:    vmv1r.v v0, v12
+; RV32D-NEXT:    vmerge.vxm v8, v8, a1, v0
+; RV32D-NEXT:    fsrm a0
+; RV32D-NEXT:    ret
+;
+; RV64D-LABEL: cttz_nxv4i64:
+; RV64D:       # %bb.0:
+; RV64D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV64D-NEXT:    vrsub.vi v12, v8, 0
+; RV64D-NEXT:    vand.vv v12, v8, v12
+; RV64D-NEXT:    vmset.m v0
+; RV64D-NEXT:    fsrmi a0, 1
+; RV64D-NEXT:    vfcvt.f.xu.v v12, v12, v0.t
+; RV64D-NEXT:    li a1, 52
+; RV64D-NEXT:    vsrl.vx v12, v12, a1
+; RV64D-NEXT:    li a1, 1023
+; RV64D-NEXT:    vsub.vx v12, v12, a1
+; RV64D-NEXT:    vmseq.vi v0, v8, 0
+; RV64D-NEXT:    li a1, 64
+; RV64D-NEXT:    vmerge.vxm v8, v12, a1, v0
+; RV64D-NEXT:    fsrm a0
+; RV64D-NEXT:    ret
   %a = call <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64> %va, i1 false)
   ret <vscale x 4 x i64> %a
 }
 declare <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64>, i1)
 
 define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
-; RV32-LABEL: cttz_nxv8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v24, v0, v24
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_nxv8i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vx v16, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v24, (a0), zero
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vsrl.vi v0, v8, 1
+; RV32I-NEXT:    vand.vv v24, v0, v24
+; RV32I-NEXT:    vsub.vv v8, v8, v24
+; RV32I-NEXT:    vand.vv v24, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vlse64.v v24, (a0), zero
+; RV32I-NEXT:    vsrl.vi v0, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v0
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_nxv8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    lui a0, %hi(.LCPI21_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI21_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI21_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI21_1)(a1)
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI21_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI21_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI21_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI21_3)(a1)
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_nxv8i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, %hi(.LCPI21_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI21_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI21_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI21_1)(a1)
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v16
+; RV64I-NEXT:    vand.vx v16, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v16, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI21_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI21_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI21_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI21_3)(a1)
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; RV32F-LABEL: cttz_nxv8i64:
+; RV32F:       # %bb.0:
+; RV32F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32F-NEXT:    vmseq.vx v16, v8, zero
+; RV32F-NEXT:    vrsub.vi v24, v8, 0
+; RV32F-NEXT:    vand.vv v8, v8, v24
+; RV32F-NEXT:    vmset.m v0
+; RV32F-NEXT:    fsrmi a0, 1
+; RV32F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV32F-NEXT:    vfncvt.f.xu.w v20, v8, v0.t
+; RV32F-NEXT:    vsrl.vi v8, v20, 23
+; RV32F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32F-NEXT:    vzext.vf2 v24, v8
+; RV32F-NEXT:    li a1, 127
+; RV32F-NEXT:    vsub.vx v8, v24, a1
+; RV32F-NEXT:    li a1, 64
+; RV32F-NEXT:    vmv1r.v v0, v16
+; RV32F-NEXT:    vmerge.vxm v8, v8, a1, v0
+; RV32F-NEXT:    fsrm a0
+; RV32F-NEXT:    ret
+;
+; RV64F-LABEL: cttz_nxv8i64:
+; RV64F:       # %bb.0:
+; RV64F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64F-NEXT:    vrsub.vi v16, v8, 0
+; RV64F-NEXT:    vand.vv v16, v8, v16
+; RV64F-NEXT:    vmset.m v0
+; RV64F-NEXT:    fsrmi a0, 1
+; RV64F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; RV64F-NEXT:    vfncvt.f.xu.w v24, v16, v0.t
+; RV64F-NEXT:    vsrl.vi v16, v24, 23
+; RV64F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64F-NEXT:    vzext.vf2 v24, v16
+; RV64F-NEXT:    li a1, 127
+; RV64F-NEXT:    vsub.vx v16, v24, a1
+; RV64F-NEXT:    vmseq.vi v0, v8, 0
+; RV64F-NEXT:    li a1, 64
+; RV64F-NEXT:    vmerge.vxm v8, v16, a1, v0
+; RV64F-NEXT:    fsrm a0
+; RV64F-NEXT:    ret
+;
+; RV32D-LABEL: cttz_nxv8i64:
+; RV32D:       # %bb.0:
+; RV32D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32D-NEXT:    vmseq.vx v16, v8, zero
+; RV32D-NEXT:    vrsub.vi v24, v8, 0
+; RV32D-NEXT:    vand.vv v8, v8, v24
+; RV32D-NEXT:    vmset.m v0
+; RV32D-NEXT:    fsrmi a0, 1
+; RV32D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; RV32D-NEXT:    li a1, 52
+; RV32D-NEXT:    vsrl.vx v8, v8, a1
+; RV32D-NEXT:    li a1, 1023
+; RV32D-NEXT:    vsub.vx v8, v8, a1
+; RV32D-NEXT:    li a1, 64
+; RV32D-NEXT:    vmv1r.v v0, v16
+; RV32D-NEXT:    vmerge.vxm v8, v8, a1, v0
+; RV32D-NEXT:    fsrm a0
+; RV32D-NEXT:    ret
+;
+; RV64D-LABEL: cttz_nxv8i64:
+; RV64D:       # %bb.0:
+; RV64D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64D-NEXT:    vrsub.vi v16, v8, 0
+; RV64D-NEXT:    vand.vv v16, v8, v16
+; RV64D-NEXT:    vmset.m v0
+; RV64D-NEXT:    fsrmi a0, 1
+; RV64D-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
+; RV64D-NEXT:    li a1, 52
+; RV64D-NEXT:    vsrl.vx v16, v16, a1
+; RV64D-NEXT:    li a1, 1023
+; RV64D-NEXT:    vsub.vx v16, v16, a1
+; RV64D-NEXT:    vmseq.vi v0, v8, 0
+; RV64D-NEXT:    li a1, 64
+; RV64D-NEXT:    vmerge.vxm v8, v16, a1, v0
+; RV64D-NEXT:    fsrm a0
+; RV64D-NEXT:    ret
   %a = call <vscale x 8 x i64> @llvm.cttz.nxv8i64(<vscale x 8 x i64> %va, i1 false)
   ret <vscale x 8 x i64> %a
 }
@@ -1440,6 +2011,21 @@ define <vscale x 1 x i8> @cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv1i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv1i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
@@ -1480,6 +2066,21 @@ define <vscale x 2 x i8> @cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv2i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv2i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
@@ -1520,6 +2121,21 @@ define <vscale x 4 x i8> @cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv4i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv4i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
@@ -1560,6 +2176,21 @@ define <vscale x 8 x i8> @cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv8i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v10, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv8i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
@@ -1600,6 +2231,21 @@ define <vscale x 16 x i8> @cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vand.vi v8, v8, 15
 ; CHECK-ZVE64X-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv16i8:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
+; CHECK-F-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-F-NEXT:    vnsrl.wi v12, v8, 0
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v12, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv16i8:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
@@ -1725,6 +2371,17 @@ define <vscale x 1 x i16> @cttz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv1i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv1i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
@@ -1796,6 +2453,17 @@ define <vscale x 2 x i16> @cttz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv2i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv2i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
@@ -1867,6 +2535,17 @@ define <vscale x 4 x i16> @cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv4i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv4i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
@@ -1938,6 +2617,17 @@ define <vscale x 8 x i16> @cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv8i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv8i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
@@ -2009,6 +2699,17 @@ define <vscale x 16 x i16> @cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 8
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv16i16:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v12
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    li a0, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv16i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -2142,6 +2843,20 @@ define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv1i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv1i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
@@ -2219,6 +2934,20 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv2i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv2i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
@@ -2296,6 +3025,20 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv4i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv4i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
@@ -2373,6 +3116,20 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; RV64I-NEXT:    vsrl.vi v8, v8, 24
 ; RV64I-NEXT:    ret
 ;
+; CHECK-F-LABEL: cttz_zero_undef_nxv8i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v12
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
 ; CHECK-D-LABEL: cttz_zero_undef_nxv8i32:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
@@ -2392,383 +3149,539 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 }
 
 define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv16i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vand.vx v16, v16, a0
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vand.vx v16, v8, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vand.vx v8, v8, a0
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    vmul.vx v8, v8, a0
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_zero_undef_nxv16i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32I-NEXT:    vsub.vx v16, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vand.vx v16, v16, a0
+; RV32I-NEXT:    vsub.vv v8, v8, v16
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vand.vx v16, v8, a0
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vx v8, v8, a0
+; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v16
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vand.vx v8, v8, a0
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vmul.vx v8, v8, a0
+; RV32I-NEXT:    vsrl.vi v8, v8, 24
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_zero_undef_nxv16i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    vmul.vx v8, v8, a0
-; RV64-NEXT:    vsrl.vi v8, v8, 24
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_zero_undef_nxv16i32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, 209715
+; RV64I-NEXT:    addiw a0, a0, 819
+; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vadd.vv v8, v16, v8
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, 61681
+; RV64I-NEXT:    addiw a0, a0, -241
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    lui a0, 4112
+; RV64I-NEXT:    addiw a0, a0, 257
+; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vsrl.vi v8, v8, 24
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv16i32:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v16
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv16i32:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v16
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-D-NEXT:    li a1, 127
+; CHECK-D-NEXT:    vsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 16 x i32> @llvm.cttz.nxv16i32(<vscale x 16 x i32> %va, i1 true)
   ret <vscale x 16 x i32> %a
 }
 
 define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv1i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vlse64.v v9, (a0), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 1
-; RV32-NEXT:    vand.vv v10, v11, v10
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vlse64.v v9, (a0), zero
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsrl.vi v11, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v11
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vmul.vv v8, v8, v10
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_zero_undef_nxv1i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vx v9, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vlse64.v v9, (a0), zero
+; RV32I-NEXT:    vsrl.vi v11, v8, 1
+; RV32I-NEXT:    vand.vv v10, v11, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v9
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vlse64.v v9, (a0), zero
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vsrl.vi v11, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v11
+; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmul.vv v8, v8, v10
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_zero_undef_nxv1i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsub.vx v9, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v9
-; RV64-NEXT:    lui a0, %hi(.LCPI40_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI40_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI40_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI40_1)(a1)
-; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v9, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI40_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI40_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI40_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI40_3)(a1)
-; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_zero_undef_nxv1i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT:    vsub.vx v9, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v9
+; RV64I-NEXT:    lui a0, %hi(.LCPI40_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI40_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI40_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI40_1)(a1)
+; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    vand.vx v9, v9, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v9
+; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v9, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI40_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI40_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI40_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI40_3)(a1)
+; RV64I-NEXT:    vsrl.vi v9, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v9
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv1i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v9, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v9, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v8, v9, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv1i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v9
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-D-NEXT:    li a1, 1023
+; CHECK-D-NEXT:    vsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 1 x i64> @llvm.cttz.nxv1i64(<vscale x 1 x i64> %va, i1 true)
   ret <vscale x 1 x i64> %a
 }
 
 define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 1
-; RV32-NEXT:    vand.vv v12, v14, v12
-; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vlse64.v v10, (a0), zero
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsrl.vi v14, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v14
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vmul.vv v8, v8, v12
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_zero_undef_nxv2i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vx v10, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vsrl.vi v14, v8, 1
+; RV32I-NEXT:    vand.vv v12, v14, v12
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vlse64.v v10, (a0), zero
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vsrl.vi v14, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v14
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmul.vv v8, v8, v12
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_zero_undef_nxv2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsub.vx v10, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v10
-; RV64-NEXT:    lui a0, %hi(.LCPI41_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI41_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI41_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI41_1)(a1)
-; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v10, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI41_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI41_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI41_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI41_3)(a1)
-; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_zero_undef_nxv2i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT:    vsub.vx v10, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v10
+; RV64I-NEXT:    lui a0, %hi(.LCPI41_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI41_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI41_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI41_1)(a1)
+; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    vand.vx v10, v10, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v10
+; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v10, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI41_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI41_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI41_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI41_3)(a1)
+; RV64I-NEXT:    vsrl.vi v10, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v10
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv2i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v10, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v8, v10, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv2i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v10
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-D-NEXT:    li a1, 1023
+; CHECK-D-NEXT:    vsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 2 x i64> @llvm.cttz.nxv2i64(<vscale x 2 x i64> %va, i1 true)
   ret <vscale x 2 x i64> %a
 }
 
 define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 1
-; RV32-NEXT:    vand.vv v16, v20, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vlse64.v v12, (a0), zero
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsrl.vi v20, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v20
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vmul.vv v8, v8, v16
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_zero_undef_nxv4i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vx v12, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vsrl.vi v20, v8, 1
+; RV32I-NEXT:    vand.vv v16, v20, v16
+; RV32I-NEXT:    vsub.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v16, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vadd.vv v8, v16, v8
+; RV32I-NEXT:    vlse64.v v12, (a0), zero
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vsrl.vi v20, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v20
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_zero_undef_nxv4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsub.vx v12, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v12
-; RV64-NEXT:    lui a0, %hi(.LCPI42_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI42_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI42_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI42_1)(a1)
-; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v12, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI42_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI42_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI42_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI42_3)(a1)
-; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_zero_undef_nxv4i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT:    vsub.vx v12, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v12
+; RV64I-NEXT:    lui a0, %hi(.LCPI42_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI42_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI42_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI42_1)(a1)
+; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    vand.vx v12, v12, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v12
+; RV64I-NEXT:    vand.vx v12, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v12, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI42_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI42_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI42_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI42_3)(a1)
+; RV64I-NEXT:    vsrl.vi v12, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v12
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv4i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v12
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v12, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v8, v12, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv4i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v12
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-D-NEXT:    li a1, 1023
+; CHECK-D-NEXT:    vsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64> %va, i1 true)
   ret <vscale x 4 x i64> %a
 }
 
 define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
-; RV32-LABEL: cttz_zero_undef_nxv8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    lui a0, 4112
-; RV32-NEXT:    addi a0, a0, 257
-; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a0
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v24, v0, v24
-; RV32-NEXT:    vsub.vv v8, v8, v24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vlse64.v v16, (a0), zero
-; RV32-NEXT:    vlse64.v v24, (a0), zero
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vmul.vv v8, v8, v24
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    ret
+; RV32I-LABEL: cttz_zero_undef_nxv8i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    lui a0, 4112
+; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    sw a0, 8(sp)
+; RV32I-NEXT:    li a0, 1
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vx v16, v8, a0
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    addi a0, sp, 8
+; RV32I-NEXT:    vlse64.v v24, (a0), zero
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vsrl.vi v0, v8, 1
+; RV32I-NEXT:    vand.vv v24, v0, v24
+; RV32I-NEXT:    vsub.vv v8, v8, v24
+; RV32I-NEXT:    vand.vv v24, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vlse64.v v16, (a0), zero
+; RV32I-NEXT:    vlse64.v v24, (a0), zero
+; RV32I-NEXT:    vsrl.vi v0, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v0
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
+; RV32I-NEXT:    li a0, 56
+; RV32I-NEXT:    vsrl.vx v8, v8, a0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
 ;
-; RV64-LABEL: cttz_zero_undef_nxv8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a0
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v16
-; RV64-NEXT:    lui a0, %hi(.LCPI43_0)
-; RV64-NEXT:    ld a0, %lo(.LCPI43_0)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI43_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI43_1)(a1)
-; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    vadd.vv v8, v16, v8
-; RV64-NEXT:    lui a0, %hi(.LCPI43_2)
-; RV64-NEXT:    ld a0, %lo(.LCPI43_2)(a0)
-; RV64-NEXT:    lui a1, %hi(.LCPI43_3)
-; RV64-NEXT:    ld a1, %lo(.LCPI43_3)(a1)
-; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a0
-; RV64-NEXT:    ret
+; RV64I-LABEL: cttz_zero_undef_nxv8i64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    li a0, 1
+; RV64I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    vnot.v v8, v8
+; RV64I-NEXT:    vand.vv v8, v8, v16
+; RV64I-NEXT:    lui a0, %hi(.LCPI43_0)
+; RV64I-NEXT:    ld a0, %lo(.LCPI43_0)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI43_1)
+; RV64I-NEXT:    ld a1, %lo(.LCPI43_1)(a1)
+; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    vand.vx v16, v16, a0
+; RV64I-NEXT:    vsub.vv v8, v8, v16
+; RV64I-NEXT:    vand.vx v16, v8, a1
+; RV64I-NEXT:    vsrl.vi v8, v8, 2
+; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vadd.vv v8, v16, v8
+; RV64I-NEXT:    lui a0, %hi(.LCPI43_2)
+; RV64I-NEXT:    ld a0, %lo(.LCPI43_2)(a0)
+; RV64I-NEXT:    lui a1, %hi(.LCPI43_3)
+; RV64I-NEXT:    ld a1, %lo(.LCPI43_3)(a1)
+; RV64I-NEXT:    vsrl.vi v16, v8, 4
+; RV64I-NEXT:    vadd.vv v8, v8, v16
+; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    li a0, 56
+; RV64I-NEXT:    vsrl.vx v8, v8, a0
+; RV64I-NEXT:    ret
+;
+; CHECK-F-LABEL: cttz_zero_undef_nxv8i64:
+; CHECK-F:       # %bb.0:
+; CHECK-F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v16
+; CHECK-F-NEXT:    vmset.m v0
+; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8, v0.t
+; CHECK-F-NEXT:    vsrl.vi v8, v16, 23
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT:    vzext.vf2 v16, v8
+; CHECK-F-NEXT:    li a1, 127
+; CHECK-F-NEXT:    vsub.vx v8, v16, a1
+; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    ret
+;
+; CHECK-D-LABEL: cttz_zero_undef_nxv8i64:
+; CHECK-D:       # %bb.0:
+; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v16
+; CHECK-D-NEXT:    vmset.m v0
+; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
+; CHECK-D-NEXT:    li a1, 1023
+; CHECK-D-NEXT:    vsub.vx v8, v8, a1
+; CHECK-D-NEXT:    fsrm a0
+; CHECK-D-NEXT:    ret
   %a = call <vscale x 8 x i64> @llvm.cttz.nxv8i64(<vscale x 8 x i64> %va, i1 true)
   ret <vscale x 8 x i64> %a
 }

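For readers skimming the CHECK-F output above: each block implements the same bit trick. It isolates the lowest set bit (vrsub.vi + vand.vv), converts it to f32 (vfcvt.f.xu.v, or the widening/narrowing forms for i8/i16/i64 elements), and reads the trailing-zero count out of the IEEE-754 exponent field (shift right by 23, subtract the bias 127). A minimal scalar C++ sketch of that computation for a 32-bit element follows; the helper name is made up for illustration and is not part of the patch.

// Scalar illustration of the CHECK-F cttz sequences: lowest set bit ->
// f32 -> exponent field. X & -X is always a power of two, so the
// conversion to f32 is exact for every element width tested above.
#include <cassert>
#include <cstdint>
#include <cstring>

static unsigned cttzViaF32(uint32_t X) {
  assert(X != 0 && "cttz_zero_undef: zero input is poison");
  uint32_t LowBit = X & -X;             // vrsub.vi v9, v8, 0 ; vand.vv
  float F = static_cast<float>(LowBit); // vfcvt.f.xu.v (exact: power of two)
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // reinterpret the f32 bit pattern
  return (Bits >> 23) - 127;            // vsrl.vi 23 ; vsub.vx 127
}

int main() {
  assert(cttzViaF32(0x50) == 4);        // 0b1010000 -> 4 trailing zeros
  assert(cttzViaF32(1u << 31) == 31);
  return 0;
}

The ctlz sequences in the fixed-vector test below read the exponent the same way (compare against 142/158/190 for i16/i32/i64) but convert the full element value under rounding mode 1 (round toward zero), selected via fsrmi/fsrm; a plain C++ integer-to-float cast rounds to nearest, so a faithful scalar analogue of those would additionally need std::fesetround(FE_TOWARDZERO) from <cfenv>.
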
diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index cc08aeb8cd2ba..abc68c40ad298 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64I
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32F
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64F
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32D
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64D
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32
@@ -201,6 +203,34 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; LMULMAX1-RV64-NEXT:    vse16.v v8, (a0)
 ; LMULMAX1-RV64-NEXT:    ret
 ;
+; LMULMAX2-RV32F-LABEL: ctlz_v8i16:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle16.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    vfwcvt.f.xu.v v10, v8
+; LMULMAX2-RV32F-NEXT:    vnsrl.wi v9, v10, 23
+; LMULMAX2-RV32F-NEXT:    li a1, 142
+; LMULMAX2-RV32F-NEXT:    vrsub.vx v9, v9, a1
+; LMULMAX2-RV32F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT:    li a1, 16
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse16.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
+;
+; LMULMAX2-RV64F-LABEL: ctlz_v8i16:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle16.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vfwcvt.f.xu.v v10, v8
+; LMULMAX2-RV64F-NEXT:    vnsrl.wi v9, v10, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 142
+; LMULMAX2-RV64F-NEXT:    vrsub.vx v9, v9, a1
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 16
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse16.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
+;
 ; LMULMAX2-RV32D-LABEL: ctlz_v8i16:
 ; LMULMAX2-RV32D:       # %bb.0:
 ; LMULMAX2-RV32D-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
@@ -327,81 +357,39 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV64I-NEXT:    vse32.v v8, (a0)
 ; LMULMAX2-RV64I-NEXT:    ret
 ;
-; LMULMAX1-RV32-LABEL: ctlz_v4i32:
-; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vle32.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 16
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT:    lui a1, 349525
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a1
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 209715
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 61681
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT:    lui a1, 4112
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX1-RV32-NEXT:    vmul.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT:    vse32.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    ret
+; LMULMAX2-RV32F-LABEL: ctlz_v4i32:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    vmset.m v0
+; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT:    vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX2-RV32F-NEXT:    fsrm a1
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v9, v9, 23
+; LMULMAX2-RV32F-NEXT:    li a1, 158
+; LMULMAX2-RV32F-NEXT:    vrsub.vx v9, v9, a1
+; LMULMAX2-RV32F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT:    li a1, 32
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
 ;
-; LMULMAX1-RV64-LABEL: ctlz_v4i32:
-; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV64-NEXT:    vle32.v v8, (a0)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 8
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 16
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT:    lui a1, 349525
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a1
-; LMULMAX1-RV64-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    lui a1, 209715
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    lui a1, 61681
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    lui a1, 4112
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 257
-; LMULMAX1-RV64-NEXT:    vmul.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX1-RV64-NEXT:    vse32.v v8, (a0)
-; LMULMAX1-RV64-NEXT:    ret
+; LMULMAX2-RV64F-LABEL: ctlz_v4i32:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vmset.m v0
+; LMULMAX2-RV64F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT:    vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX2-RV64F-NEXT:    fsrm a1
+; LMULMAX2-RV64F-NEXT:    vsrl.vi v9, v9, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 158
+; LMULMAX2-RV64F-NEXT:    vrsub.vx v9, v9, a1
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 32
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
 ;
 ; LMULMAX2-RV32D-LABEL: ctlz_v4i32:
 ; LMULMAX2-RV32D:       # %bb.0:
@@ -456,258 +444,204 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
 
 define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: ctlz_v2i64:
-; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 8
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 16
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    li a1, 32
-; LMULMAX2-RV32-NEXT:    vsrl.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.i v9, -1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vxor.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT:    lui a1, 349525
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    lui a1, 209715
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v10, v8, v9
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    lui a1, 61681
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    lui a1, 4112
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmul.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    li a1, 56
-; LMULMAX2-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    ret
+; LMULMAX2-RV32I-LABEL: ctlz_v2i64:
+; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 2
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 8
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 16
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    li a1, 32
+; LMULMAX2-RV32I-NEXT:    vsrl.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.i v9, -1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v9, v10
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v9
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    li a1, 56
+; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    ret
 ;
-; LMULMAX2-RV64-LABEL: ctlz_v2i64:
-; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    vsrl.vi v9, v8, 8
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    vsrl.vi v9, v8, 16
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    li a1, 32
-; LMULMAX2-RV64-NEXT:    vsrl.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI3_0)
-; LMULMAX2-RV64-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI3_1)
-; LMULMAX2-RV64-NEXT:    ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT:    vand.vx v9, v9, a1
-; LMULMAX2-RV64-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    vand.vx v9, v8, a2
-; LMULMAX2-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI3_2)
-; LMULMAX2-RV64-NEXT:    ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI3_3)
-; LMULMAX2-RV64-NEXT:    ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    li a1, 56
-; LMULMAX2-RV64-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    ret
+; LMULMAX2-RV64I-LABEL: ctlz_v2i64:
+; LMULMAX2-RV64I:       # %bb.0:
+; LMULMAX2-RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v9, v8, 2
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v9, v8, 8
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v9, v8, 16
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    li a1, 32
+; LMULMAX2-RV64I-NEXT:    vsrl.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT:    lui a1, %hi(.LCPI3_0)
+; LMULMAX2-RV64I-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
+; LMULMAX2-RV64I-NEXT:    lui a2, %hi(.LCPI3_1)
+; LMULMAX2-RV64I-NEXT:    ld a2, %lo(.LCPI3_1)(a2)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT:    vand.vx v9, v9, a1
+; LMULMAX2-RV64I-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    vand.vx v9, v8, a2
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v9, v8
+; LMULMAX2-RV64I-NEXT:    lui a1, %hi(.LCPI3_2)
+; LMULMAX2-RV64I-NEXT:    ld a1, %lo(.LCPI3_2)(a1)
+; LMULMAX2-RV64I-NEXT:    lui a2, %hi(.LCPI3_3)
+; LMULMAX2-RV64I-NEXT:    ld a2, %lo(.LCPI3_3)(a2)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vmul.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT:    li a1, 56
+; LMULMAX2-RV64I-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    ret
 ;
-; LMULMAX1-RV32-LABEL: ctlz_v2i64:
-; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 16
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    li a1, 32
-; LMULMAX1-RV32-NEXT:    vsrl.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.i v9, -1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vxor.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT:    lui a1, 349525
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 209715
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 61681
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 4112
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmul.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    li a1, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    ret
+; LMULMAX2-RV32F-LABEL: ctlz_v2i64:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    vmset.m v0
+; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT:    vfncvt.f.xu.w v9, v8, v0.t
+; LMULMAX2-RV32F-NEXT:    fsrm a1
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v9, v9, 23
+; LMULMAX2-RV32F-NEXT:    li a1, 190
+; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vwsubu.wv v10, v10, v9
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmv.v.i v9, 0
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmseq.vv v0, v8, v9
+; LMULMAX2-RV32F-NEXT:    li a1, 64
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
 ;
-; LMULMAX1-RV64-LABEL: ctlz_v2i64:
-; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 8
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 16
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    li a1, 32
-; LMULMAX1-RV64-NEXT:    vsrl.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV64-NEXT:    lui a1, %hi(.LCPI3_0)
-; LMULMAX1-RV64-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX1-RV64-NEXT:    lui a2, %hi(.LCPI3_1)
-; LMULMAX1-RV64-NEXT:    ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a1
-; LMULMAX1-RV64-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v8, a2
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX1-RV64-NEXT:    lui a1, %hi(.LCPI3_2)
-; LMULMAX1-RV64-NEXT:    ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX1-RV64-NEXT:    lui a2, %hi(.LCPI3_3)
-; LMULMAX1-RV64-NEXT:    ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX1-RV64-NEXT:    li a1, 56
-; LMULMAX1-RV64-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX1-RV64-NEXT:    ret
+; LMULMAX2-RV64F-LABEL: ctlz_v2i64:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vmset.m v0
+; LMULMAX2-RV64F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT:    vfncvt.f.xu.w v9, v8, v0.t
+; LMULMAX2-RV64F-NEXT:    fsrm a1
+; LMULMAX2-RV64F-NEXT:    vsrl.vi v9, v9, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 190
+; LMULMAX2-RV64F-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV64F-NEXT:    vwsubu.vv v11, v10, v9
+; LMULMAX2-RV64F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 64
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v11, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
+;
+; LMULMAX2-RV32D-LABEL: ctlz_v2i64:
+; LMULMAX2-RV32D:       # %bb.0:
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32D-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    vmset.m v0
+; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX2-RV32D-NEXT:    fsrm a1
+; LMULMAX2-RV32D-NEXT:    li a1, 52
+; LMULMAX2-RV32D-NEXT:    vsrl.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT:    li a1, 1086
+; LMULMAX2-RV32D-NEXT:    vrsub.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32D-NEXT:    vmv.v.i v10, 0
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32D-NEXT:    vmseq.vv v0, v8, v10
+; LMULMAX2-RV32D-NEXT:    li a1, 64
+; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32D-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    ret
+;
+; LMULMAX2-RV64D-LABEL: ctlz_v2i64:
+; LMULMAX2-RV64D:       # %bb.0:
+; LMULMAX2-RV64D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64D-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    vmset.m v0
+; LMULMAX2-RV64D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT:    vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX2-RV64D-NEXT:    fsrm a1
+; LMULMAX2-RV64D-NEXT:    li a1, 52
+; LMULMAX2-RV64D-NEXT:    vsrl.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT:    li a1, 1086
+; LMULMAX2-RV64D-NEXT:    vrsub.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT:    li a1, 64
+; LMULMAX2-RV64D-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64D-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    ret
 ;
 ; LMULMAX8-RV32-LABEL: ctlz_v2i64:
 ; LMULMAX8-RV32:       # %bb.0:
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX8-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    vsrl.vi v9, v8, 8
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    vsrl.vi v9, v8, 16
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    li a1, 32
-; LMULMAX8-RV32-NEXT:    vsrl.vx v9, v8, a1
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.i v9, -1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vxor.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX8-RV32-NEXT:    lui a1, 349525
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX8-RV32-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    lui a1, 209715
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v10, v8, v9
-; LMULMAX8-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX8-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX8-RV32-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    lui a1, 61681
-; LMULMAX8-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    lui a1, 4112
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 257
+; LMULMAX8-RV32-NEXT:    vmset.m v0
+; LMULMAX8-RV32-NEXT:    fsrmi a1, 1
+; LMULMAX8-RV32-NEXT:    vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX8-RV32-NEXT:    fsrm a1
+; LMULMAX8-RV32-NEXT:    li a1, 52
+; LMULMAX8-RV32-NEXT:    vsrl.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT:    li a1, 1086
+; LMULMAX8-RV32-NEXT:    vrsub.vx v9, v9, a1
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v9, a1
+; LMULMAX8-RV32-NEXT:    vmv.v.i v10, 0
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmul.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    li a1, 56
-; LMULMAX8-RV32-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT:    vmseq.vv v0, v8, v10
+; LMULMAX8-RV32-NEXT:    li a1, 64
+; LMULMAX8-RV32-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; LMULMAX8-RV32-NEXT:    vse64.v v8, (a0)
 ; LMULMAX8-RV32-NEXT:    ret
 ;
@@ -715,41 +649,17 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX8-RV64:       # %bb.0:
 ; LMULMAX8-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX8-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    vsrl.vi v9, v8, 2
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    vsrl.vi v9, v8, 8
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    vsrl.vi v9, v8, 16
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    li a1, 32
-; LMULMAX8-RV64-NEXT:    vsrl.vx v9, v8, a1
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX8-RV64-NEXT:    lui a1, %hi(.LCPI3_0)
-; LMULMAX8-RV64-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX8-RV64-NEXT:    lui a2, %hi(.LCPI3_1)
-; LMULMAX8-RV64-NEXT:    ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX8-RV64-NEXT:    vand.vx v9, v9, a1
-; LMULMAX8-RV64-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    vand.vx v9, v8, a2
-; LMULMAX8-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX8-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX8-RV64-NEXT:    lui a1, %hi(.LCPI3_2)
-; LMULMAX8-RV64-NEXT:    ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX8-RV64-NEXT:    lui a2, %hi(.LCPI3_3)
-; LMULMAX8-RV64-NEXT:    ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX8-RV64-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT:    li a1, 56
-; LMULMAX8-RV64-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT:    vmset.m v0
+; LMULMAX8-RV64-NEXT:    fsrmi a1, 1
+; LMULMAX8-RV64-NEXT:    vfcvt.f.xu.v v9, v8, v0.t
+; LMULMAX8-RV64-NEXT:    fsrm a1
+; LMULMAX8-RV64-NEXT:    li a1, 52
+; LMULMAX8-RV64-NEXT:    vsrl.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT:    li a1, 1086
+; LMULMAX8-RV64-NEXT:    vrsub.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT:    li a1, 64
+; LMULMAX8-RV64-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; LMULMAX8-RV64-NEXT:    vse64.v v8, (a0)
 ; LMULMAX8-RV64-NEXT:    ret
   %a = load <2 x i64>, ptr %x
@@ -1069,209 +979,149 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1)
 
 define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: ctlz_v8i32:
-; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vle32.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vnot.v v8, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT:    lui a1, 349525
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vand.vx v10, v10, a1
-; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 209715
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vand.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 61681
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    lui a1, 4112
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32-NEXT:    vmul.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX2-RV32-NEXT:    vse32.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    ret
-;
-; LMULMAX2-RV64-LABEL: ctlz_v8i32:
-; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV64-NEXT:    vle32.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT:    lui a1, 349525
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT:    vand.vx v10, v10, a1
-; LMULMAX2-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    lui a1, 209715
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT:    vand.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    lui a1, 61681
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    lui a1, 4112
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, 257
-; LMULMAX2-RV64-NEXT:    vmul.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX2-RV64-NEXT:    vse32.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    ret
+; LMULMAX2-RV32I-LABEL: ctlz_v8i32:
+; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 2
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 8
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 16
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    vand.vx v10, v10, a1
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    vand.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    vmul.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 24
+; LMULMAX2-RV32I-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    ret
 ;
-; LMULMAX1-RV32-LABEL: ctlz_v8i32:
-; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle32.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    vle32.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT:    lui a2, 349525
-; LMULMAX1-RV32-NEXT:    addi a2, a2, 1365
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a2
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    lui a3, 209715
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 819
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v8, a3
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a3
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    lui a4, 61681
-; LMULMAX1-RV32-NEXT:    addi a4, a4, -241
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a4
-; LMULMAX1-RV32-NEXT:    lui a5, 4112
-; LMULMAX1-RV32-NEXT:    addi a5, a5, 257
-; LMULMAX1-RV32-NEXT:    vmul.vx v8, v8, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 8
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 16
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vnot.v v9, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a2
-; LMULMAX1-RV32-NEXT:    vsub.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v9, a3
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a3
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v10, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a4
-; LMULMAX1-RV32-NEXT:    vmul.vx v9, v9, a5
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v9, 24
-; LMULMAX1-RV32-NEXT:    vse32.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vse32.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    ret
+; LMULMAX2-RV64I-LABEL: ctlz_v8i32:
+; LMULMAX2-RV64I:       # %bb.0:
+; LMULMAX2-RV64I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64I-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 2
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 8
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 16
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT:    lui a1, 349525
+; LMULMAX2-RV64I-NEXT:    addiw a1, a1, 1365
+; LMULMAX2-RV64I-NEXT:    vand.vx v10, v10, a1
+; LMULMAX2-RV64I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    lui a1, 209715
+; LMULMAX2-RV64I-NEXT:    addiw a1, a1, 819
+; LMULMAX2-RV64I-NEXT:    vand.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    lui a1, 61681
+; LMULMAX2-RV64I-NEXT:    addiw a1, a1, -241
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    lui a1, 4112
+; LMULMAX2-RV64I-NEXT:    addiw a1, a1, 257
+; LMULMAX2-RV64I-NEXT:    vmul.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v8, v8, 24
+; LMULMAX2-RV64I-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    ret
 ;
-; LMULMAX1-RV64-LABEL: ctlz_v8i32:
-; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV64-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV64-NEXT:    vle32.v v8, (a1)
-; LMULMAX1-RV64-NEXT:    vle32.v v9, (a0)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT:    lui a2, 349525
-; LMULMAX1-RV64-NEXT:    addiw a2, a2, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a2
-; LMULMAX1-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    lui a3, 209715
-; LMULMAX1-RV64-NEXT:    addiw a3, a3, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v8, a3
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a3
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    lui a4, 61681
-; LMULMAX1-RV64-NEXT:    addiw a4, a4, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a4
-; LMULMAX1-RV64-NEXT:    lui a5, 4112
-; LMULMAX1-RV64-NEXT:    addiw a5, a5, 257
-; LMULMAX1-RV64-NEXT:    vmul.vx v8, v8, a5
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 8
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 16
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vnot.v v9, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a2
-; LMULMAX1-RV64-NEXT:    vsub.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v9, a3
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v9, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a3
-; LMULMAX1-RV64-NEXT:    vadd.vv v9, v10, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a4
-; LMULMAX1-RV64-NEXT:    vmul.vx v9, v9, a5
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v9, 24
-; LMULMAX1-RV64-NEXT:    vse32.v v9, (a0)
-; LMULMAX1-RV64-NEXT:    vse32.v v8, (a1)
-; LMULMAX1-RV64-NEXT:    ret
+; LMULMAX2-RV32F-LABEL: ctlz_v8i32:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    vmset.m v0
+; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV32F-NEXT:    fsrm a1
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV32F-NEXT:    li a1, 158
+; LMULMAX2-RV32F-NEXT:    vrsub.vx v10, v10, a1
+; LMULMAX2-RV32F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT:    li a1, 32
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
+;
+; LMULMAX2-RV64F-LABEL: ctlz_v8i32:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vmset.m v0
+; LMULMAX2-RV64F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV64F-NEXT:    fsrm a1
+; LMULMAX2-RV64F-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 158
+; LMULMAX2-RV64F-NEXT:    vrsub.vx v10, v10, a1
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 32
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
+;
+; LMULMAX2-RV32D-LABEL: ctlz_v8i32:
+; LMULMAX2-RV32D:       # %bb.0:
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32D-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    vmset.m v0
+; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV32D-NEXT:    fsrm a1
+; LMULMAX2-RV32D-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV32D-NEXT:    li a1, 158
+; LMULMAX2-RV32D-NEXT:    vrsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV32D-NEXT:    li a1, 32
+; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32D-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    ret
+;
+; LMULMAX2-RV64D-LABEL: ctlz_v8i32:
+; LMULMAX2-RV64D:       # %bb.0:
+; LMULMAX2-RV64D-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64D-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    vmset.m v0
+; LMULMAX2-RV64D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV64D-NEXT:    fsrm a1
+; LMULMAX2-RV64D-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV64D-NEXT:    li a1, 158
+; LMULMAX2-RV64D-NEXT:    vrsub.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT:    li a1, 32
+; LMULMAX2-RV64D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64D-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    ret
 ;
 ; LMULMAX8-LABEL: ctlz_v8i32:
 ; LMULMAX8:       # %bb.0:
@@ -1296,314 +1146,204 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
 
 define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: ctlz_v4i64:
-; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    li a1, 32
-; LMULMAX2-RV32-NEXT:    vsrl.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.i v10, -1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT:    lui a1, 349525
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 209715
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v12, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 61681
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 4112
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmul.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    li a1, 56
-; LMULMAX2-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    ret
+; LMULMAX2-RV32I-LABEL: ctlz_v4i64:
+; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 2
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 8
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 16
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    li a1, 32
+; LMULMAX2-RV32I-NEXT:    vsrl.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.i v10, -1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v12, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v10, v12
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v12, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v12, v8
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    li a1, 56
+; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    ret
 ;
-; LMULMAX2-RV64-LABEL: ctlz_v4i64:
-; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    li a1, 32
-; LMULMAX2-RV64-NEXT:    vsrl.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI7_0)
-; LMULMAX2-RV64-NEXT:    ld a1, %lo(.LCPI7_0)(a1)
-; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI7_1)
-; LMULMAX2-RV64-NEXT:    ld a2, %lo(.LCPI7_1)(a2)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT:    vand.vx v10, v10, a1
-; LMULMAX2-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vand.vx v10, v8, a2
-; LMULMAX2-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI7_2)
-; LMULMAX2-RV64-NEXT:    ld a1, %lo(.LCPI7_2)(a1)
-; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI7_3)
-; LMULMAX2-RV64-NEXT:    ld a2, %lo(.LCPI7_3)(a2)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    li a1, 56
-; LMULMAX2-RV64-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    ret
+; LMULMAX2-RV64I-LABEL: ctlz_v4i64:
+; LMULMAX2-RV64I:       # %bb.0:
+; LMULMAX2-RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 2
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 8
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 16
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    li a1, 32
+; LMULMAX2-RV64I-NEXT:    vsrl.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT:    vor.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT:    lui a1, %hi(.LCPI7_0)
+; LMULMAX2-RV64I-NEXT:    ld a1, %lo(.LCPI7_0)(a1)
+; LMULMAX2-RV64I-NEXT:    lui a2, %hi(.LCPI7_1)
+; LMULMAX2-RV64I-NEXT:    ld a2, %lo(.LCPI7_1)(a2)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT:    vand.vx v10, v10, a1
+; LMULMAX2-RV64I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vand.vx v10, v8, a2
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV64I-NEXT:    lui a1, %hi(.LCPI7_2)
+; LMULMAX2-RV64I-NEXT:    ld a1, %lo(.LCPI7_2)(a1)
+; LMULMAX2-RV64I-NEXT:    lui a2, %hi(.LCPI7_3)
+; LMULMAX2-RV64I-NEXT:    ld a2, %lo(.LCPI7_3)(a2)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vmul.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT:    li a1, 56
+; LMULMAX2-RV64I-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    ret
 ;
-; LMULMAX1-RV32-LABEL: ctlz_v4i64:
-; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    vle64.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    li a2, 32
-; LMULMAX1-RV32-NEXT:    vsrl.vx v10, v8, a2
-; LMULMAX1-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.i v10, -1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v8, 1
-; LMULMAX1-RV32-NEXT:    lui a3, 349525
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 1365
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v12, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v12
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT:    lui a3, 209715
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 819
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v11, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v8, v11
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v13, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT:    lui a3, 61681
-; LMULMAX1-RV32-NEXT:    addi a3, a3, -241
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v13, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT:    lui a3, 4112
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 257
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v14, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmul.vv v8, v8, v14
-; LMULMAX1-RV32-NEXT:    li a3, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v8, v8, a3
-; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v9, 1
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v9, 2
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v9, 4
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v9, 8
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT:    vsrl.vi v15, v9, 16
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT:    vsrl.vx v15, v9, a2
-; LMULMAX1-RV32-NEXT:    vor.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT:    vxor.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX1-RV32-NEXT:    vsub.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v9, v11
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v10, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v13
-; LMULMAX1-RV32-NEXT:    vmul.vv v9, v9, v14
-; LMULMAX1-RV32-NEXT:    vsrl.vx v9, v9, a3
-; LMULMAX1-RV32-NEXT:    vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    ret
+; LMULMAX2-RV32F-LABEL: ctlz_v4i64:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    vmset.m v0
+; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT:    vfncvt.f.xu.w v10, v8, v0.t
+; LMULMAX2-RV32F-NEXT:    fsrm a1
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV32F-NEXT:    li a1, 190
+; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmv.v.x v12, a1
+; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vwsubu.wv v12, v12, v10
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmv.v.i v10, 0
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmseq.vv v0, v8, v10
+; LMULMAX2-RV32F-NEXT:    li a1, 64
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
 ;
-; LMULMAX1-RV64-LABEL: ctlz_v4i64:
-; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV64-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV64-NEXT:    vle64.v v8, (a1)
-; LMULMAX1-RV64-NEXT:    vle64.v v9, (a0)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    li a2, 32
-; LMULMAX1-RV64-NEXT:    vsrl.vx v10, v8, a2
-; LMULMAX1-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV64-NEXT:    lui a3, %hi(.LCPI7_0)
-; LMULMAX1-RV64-NEXT:    ld a3, %lo(.LCPI7_0)(a3)
-; LMULMAX1-RV64-NEXT:    lui a4, %hi(.LCPI7_1)
-; LMULMAX1-RV64-NEXT:    ld a4, %lo(.LCPI7_1)(a4)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v8, a4
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX1-RV64-NEXT:    lui a5, %hi(.LCPI7_2)
-; LMULMAX1-RV64-NEXT:    ld a5, %lo(.LCPI7_2)(a5)
-; LMULMAX1-RV64-NEXT:    lui a6, %hi(.LCPI7_3)
-; LMULMAX1-RV64-NEXT:    ld a6, %lo(.LCPI7_3)(a6)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a5
-; LMULMAX1-RV64-NEXT:    vmul.vx v8, v8, a6
-; LMULMAX1-RV64-NEXT:    li a7, 56
-; LMULMAX1-RV64-NEXT:    vsrl.vx v8, v8, a7
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 2
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 8
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 16
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vx v10, v9, a2
-; LMULMAX1-RV64-NEXT:    vor.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vnot.v v9, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT:    vsub.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v9, a4
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v9, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a4
-; LMULMAX1-RV64-NEXT:    vadd.vv v9, v10, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a5
-; LMULMAX1-RV64-NEXT:    vmul.vx v9, v9, a6
-; LMULMAX1-RV64-NEXT:    vsrl.vx v9, v9, a7
-; LMULMAX1-RV64-NEXT:    vse64.v v9, (a0)
-; LMULMAX1-RV64-NEXT:    vse64.v v8, (a1)
-; LMULMAX1-RV64-NEXT:    ret
+; LMULMAX2-RV64F-LABEL: ctlz_v4i64:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vmset.m v0
+; LMULMAX2-RV64F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT:    vfncvt.f.xu.w v10, v8, v0.t
+; LMULMAX2-RV64F-NEXT:    fsrm a1
+; LMULMAX2-RV64F-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 190
+; LMULMAX2-RV64F-NEXT:    vmv.v.x v11, a1
+; LMULMAX2-RV64F-NEXT:    vwsubu.vv v12, v11, v10
+; LMULMAX2-RV64F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 64
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
+;
+; LMULMAX2-RV32D-LABEL: ctlz_v4i64:
+; LMULMAX2-RV32D:       # %bb.0:
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    vmset.m v0
+; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV32D-NEXT:    fsrm a1
+; LMULMAX2-RV32D-NEXT:    li a1, 52
+; LMULMAX2-RV32D-NEXT:    vsrl.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT:    li a1, 1086
+; LMULMAX2-RV32D-NEXT:    vrsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32D-NEXT:    vmv.v.i v12, 0
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT:    vmseq.vv v0, v8, v12
+; LMULMAX2-RV32D-NEXT:    li a1, 64
+; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32D-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    ret
+;
+; LMULMAX2-RV64D-LABEL: ctlz_v4i64:
+; LMULMAX2-RV64D:       # %bb.0:
+; LMULMAX2-RV64D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64D-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    vmset.m v0
+; LMULMAX2-RV64D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX2-RV64D-NEXT:    fsrm a1
+; LMULMAX2-RV64D-NEXT:    li a1, 52
+; LMULMAX2-RV64D-NEXT:    vsrl.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT:    li a1, 1086
+; LMULMAX2-RV64D-NEXT:    vrsub.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT:    li a1, 64
+; LMULMAX2-RV64D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64D-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    ret
 ;
 ; LMULMAX8-RV32-LABEL: ctlz_v4i64:
 ; LMULMAX8-RV32:       # %bb.0:
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX8-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    li a1, 32
-; LMULMAX8-RV32-NEXT:    vsrl.vx v10, v8, a1
-; LMULMAX8-RV32-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.i v10, -1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX8-RV32-NEXT:    lui a1, 349525
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v12, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX8-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    lui a1, 209715
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v12, v8, v10
-; LMULMAX8-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX8-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX8-RV32-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    lui a1, 61681
-; LMULMAX8-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    lui a1, 4112
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 257
+; LMULMAX8-RV32-NEXT:    vmset.m v0
+; LMULMAX8-RV32-NEXT:    fsrmi a1, 1
+; LMULMAX8-RV32-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX8-RV32-NEXT:    fsrm a1
+; LMULMAX8-RV32-NEXT:    li a1, 52
+; LMULMAX8-RV32-NEXT:    vsrl.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT:    li a1, 1086
+; LMULMAX8-RV32-NEXT:    vrsub.vx v10, v10, a1
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT:    vmv.v.i v12, 0
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmul.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    li a1, 56
-; LMULMAX8-RV32-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT:    vmseq.vv v0, v8, v12
+; LMULMAX8-RV32-NEXT:    li a1, 64
+; LMULMAX8-RV32-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; LMULMAX8-RV32-NEXT:    vse64.v v8, (a0)
 ; LMULMAX8-RV32-NEXT:    ret
 ;
@@ -1611,41 +1351,17 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX8-RV64:       # %bb.0:
 ; LMULMAX8-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX8-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    vsrl.vi v10, v8, 2
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    vsrl.vi v10, v8, 8
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    vsrl.vi v10, v8, 16
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    li a1, 32
-; LMULMAX8-RV64-NEXT:    vsrl.vx v10, v8, a1
-; LMULMAX8-RV64-NEXT:    vor.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX8-RV64-NEXT:    lui a1, %hi(.LCPI7_0)
-; LMULMAX8-RV64-NEXT:    ld a1, %lo(.LCPI7_0)(a1)
-; LMULMAX8-RV64-NEXT:    lui a2, %hi(.LCPI7_1)
-; LMULMAX8-RV64-NEXT:    ld a2, %lo(.LCPI7_1)(a2)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX8-RV64-NEXT:    vand.vx v10, v10, a1
-; LMULMAX8-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    vand.vx v10, v8, a2
-; LMULMAX8-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX8-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX8-RV64-NEXT:    lui a1, %hi(.LCPI7_2)
-; LMULMAX8-RV64-NEXT:    ld a1, %lo(.LCPI7_2)(a1)
-; LMULMAX8-RV64-NEXT:    lui a2, %hi(.LCPI7_3)
-; LMULMAX8-RV64-NEXT:    ld a2, %lo(.LCPI7_3)(a2)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX8-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT:    li a1, 56
-; LMULMAX8-RV64-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT:    vmset.m v0
+; LMULMAX8-RV64-NEXT:    fsrmi a1, 1
+; LMULMAX8-RV64-NEXT:    vfcvt.f.xu.v v10, v8, v0.t
+; LMULMAX8-RV64-NEXT:    fsrm a1
+; LMULMAX8-RV64-NEXT:    li a1, 52
+; LMULMAX8-RV64-NEXT:    vsrl.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT:    li a1, 1086
+; LMULMAX8-RV64-NEXT:    vrsub.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT:    li a1, 64
+; LMULMAX8-RV64-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; LMULMAX8-RV64-NEXT:    vse64.v v8, (a0)
 ; LMULMAX8-RV64-NEXT:    ret
   %a = load <4 x i64>, ptr %x

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 144f469bf1436..4d2db34ae4cf1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64I
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64x -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32F
+; RUN: llc -mtriple=riscv64 -mattr=+m,+zve64f,+f -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64F
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV32,LMULMAX2-RV32D
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2,LMULMAX2-RV64,LMULMAX2-RV64D
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32
@@ -181,6 +183,38 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
 ; LMULMAX1-RV64-NEXT:    vse16.v v8, (a0)
 ; LMULMAX1-RV64-NEXT:    ret
 ;
+; LMULMAX2-RV32F-LABEL: cttz_v8i16:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle16.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX2-RV32F-NEXT:    vand.vv v9, v8, v9
+; LMULMAX2-RV32F-NEXT:    vfwcvt.f.xu.v v10, v9
+; LMULMAX2-RV32F-NEXT:    vnsrl.wi v9, v10, 23
+; LMULMAX2-RV32F-NEXT:    li a1, 127
+; LMULMAX2-RV32F-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX2-RV32F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT:    li a1, 16
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse16.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
+;
+; LMULMAX2-RV64F-LABEL: cttz_v8i16:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle16.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX2-RV64F-NEXT:    vand.vv v9, v8, v9
+; LMULMAX2-RV64F-NEXT:    vfwcvt.f.xu.v v10, v9
+; LMULMAX2-RV64F-NEXT:    vnsrl.wi v9, v10, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 127
+; LMULMAX2-RV64F-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 16
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse16.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
+;
 ; LMULMAX2-RV32D-LABEL: cttz_v8i16:
 ; LMULMAX2-RV32D:       # %bb.0:
 ; LMULMAX2-RV32D-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
@@ -299,67 +333,43 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
 ; LMULMAX2-RV64I-NEXT:    vse32.v v8, (a0)
 ; LMULMAX2-RV64I-NEXT:    ret
 ;
-; LMULMAX1-RV32-LABEL: cttz_v4i32:
-; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vle32.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    li a1, 1
-; LMULMAX1-RV32-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT:    lui a1, 349525
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a1
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 209715
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 61681
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT:    lui a1, 4112
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX1-RV32-NEXT:    vmul.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT:    vse32.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    ret
+; LMULMAX2-RV32F-LABEL: cttz_v4i32:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX2-RV32F-NEXT:    vand.vv v9, v8, v9
+; LMULMAX2-RV32F-NEXT:    vmset.m v0
+; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT:    vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX2-RV32F-NEXT:    fsrm a1
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v9, v9, 23
+; LMULMAX2-RV32F-NEXT:    li a1, 127
+; LMULMAX2-RV32F-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX2-RV32F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT:    li a1, 32
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
 ;
-; LMULMAX1-RV64-LABEL: cttz_v4i32:
-; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV64-NEXT:    vle32.v v8, (a0)
-; LMULMAX1-RV64-NEXT:    li a1, 1
-; LMULMAX1-RV64-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV64-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT:    lui a1, 349525
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a1
-; LMULMAX1-RV64-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    lui a1, 209715
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    lui a1, 61681
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    lui a1, 4112
-; LMULMAX1-RV64-NEXT:    addiw a1, a1, 257
-; LMULMAX1-RV64-NEXT:    vmul.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX1-RV64-NEXT:    vse32.v v8, (a0)
-; LMULMAX1-RV64-NEXT:    ret
+; LMULMAX2-RV64F-LABEL: cttz_v4i32:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX2-RV64F-NEXT:    vand.vv v9, v8, v9
+; LMULMAX2-RV64F-NEXT:    vmset.m v0
+; LMULMAX2-RV64F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT:    vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX2-RV64F-NEXT:    fsrm a1
+; LMULMAX2-RV64F-NEXT:    vsrl.vi v9, v9, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 127
+; LMULMAX2-RV64F-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 32
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
 ;
 ; LMULMAX2-RV32D-LABEL: cttz_v4i32:
 ; LMULMAX2-RV32D:       # %bb.0:
@@ -420,208 +430,197 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
 
 define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: cttz_v2i64:
-; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    li a1, 1
-; LMULMAX2-RV32-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.i v10, -1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX2-RV32-NEXT:    lui a1, 349525
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    lui a1, 209715
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v10, v8, v9
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    lui a1, 61681
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    lui a1, 4112
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV32-NEXT:    vmul.vv v8, v8, v9
-; LMULMAX2-RV32-NEXT:    li a1, 56
-; LMULMAX2-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    ret
+; LMULMAX2-RV32I-LABEL: cttz_v2i64:
+; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    li a1, 1
+; LMULMAX2-RV32I-NEXT:    vsub.vx v9, v8, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.i v10, -1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 1
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v9, v9, v10
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v8, v9
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v9, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v9, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v9
+; LMULMAX2-RV32I-NEXT:    li a1, 56
+; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    ret
 ;
-; LMULMAX2-RV64-LABEL: cttz_v2i64:
-; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX2-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    li a1, 1
-; LMULMAX2-RV64-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX2-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX2-RV64-NEXT:    vand.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI3_0)
-; LMULMAX2-RV64-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI3_1)
-; LMULMAX2-RV64-NEXT:    ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX2-RV64-NEXT:    vand.vx v9, v9, a1
-; LMULMAX2-RV64-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    vand.vx v9, v8, a2
-; LMULMAX2-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI3_2)
-; LMULMAX2-RV64-NEXT:    ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI3_3)
-; LMULMAX2-RV64-NEXT:    ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    li a1, 56
-; LMULMAX2-RV64-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    ret
+; LMULMAX2-RV64I-LABEL: cttz_v2i64:
+; LMULMAX2-RV64I:       # %bb.0:
+; LMULMAX2-RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    li a1, 1
+; LMULMAX2-RV64I-NEXT:    vsub.vx v9, v8, a1
+; LMULMAX2-RV64I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT:    vand.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    lui a1, %hi(.LCPI3_0)
+; LMULMAX2-RV64I-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
+; LMULMAX2-RV64I-NEXT:    lui a2, %hi(.LCPI3_1)
+; LMULMAX2-RV64I-NEXT:    ld a2, %lo(.LCPI3_1)(a2)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v9, v8, 1
+; LMULMAX2-RV64I-NEXT:    vand.vx v9, v9, a1
+; LMULMAX2-RV64I-NEXT:    vsub.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    vand.vx v9, v8, a2
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v9, v8
+; LMULMAX2-RV64I-NEXT:    lui a1, %hi(.LCPI3_2)
+; LMULMAX2-RV64I-NEXT:    ld a1, %lo(.LCPI3_2)(a1)
+; LMULMAX2-RV64I-NEXT:    lui a2, %hi(.LCPI3_3)
+; LMULMAX2-RV64I-NEXT:    ld a2, %lo(.LCPI3_3)(a2)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v9, v8, 4
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v8, v9
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vmul.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT:    li a1, 56
+; LMULMAX2-RV64I-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    ret
 ;
-; LMULMAX1-RV32-LABEL: cttz_v2i64:
-; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    li a1, 1
-; LMULMAX1-RV32-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.i v10, -1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV32-NEXT:    lui a1, 349525
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 209715
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v8, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 61681
-; LMULMAX1-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    lui a1, 4112
-; LMULMAX1-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmul.vv v8, v8, v9
-; LMULMAX1-RV32-NEXT:    li a1, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX1-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX1-RV32-NEXT:    ret
+; LMULMAX2-RV32F-LABEL: cttz_v2i64:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle64.v v9, (a0)
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmv.v.i v10, 0
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmseq.vv v8, v9, v10
+; LMULMAX2-RV32F-NEXT:    vsub.vv v10, v10, v9
+; LMULMAX2-RV32F-NEXT:    vand.vv v9, v9, v10
+; LMULMAX2-RV32F-NEXT:    vmset.m v0
+; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vfncvt.f.xu.w v10, v9, v0.t
+; LMULMAX2-RV32F-NEXT:    fsrm a1
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v9, v10, 23
+; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vzext.vf2 v10, v9
+; LMULMAX2-RV32F-NEXT:    li a1, 127
+; LMULMAX2-RV32F-NEXT:    vsub.vx v9, v10, a1
+; LMULMAX2-RV32F-NEXT:    li a1, 64
+; LMULMAX2-RV32F-NEXT:    vmv.v.v v0, v8
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
 ;
-; LMULMAX1-RV64-LABEL: cttz_v2i64:
-; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX1-RV64-NEXT:    li a1, 1
-; LMULMAX1-RV64-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX1-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV64-NEXT:    vand.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    lui a1, %hi(.LCPI3_0)
-; LMULMAX1-RV64-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX1-RV64-NEXT:    lui a2, %hi(.LCPI3_1)
-; LMULMAX1-RV64-NEXT:    ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a1
-; LMULMAX1-RV64-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v8, a2
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX1-RV64-NEXT:    lui a1, %hi(.LCPI3_2)
-; LMULMAX1-RV64-NEXT:    ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX1-RV64-NEXT:    lui a2, %hi(.LCPI3_3)
-; LMULMAX1-RV64-NEXT:    ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX1-RV64-NEXT:    li a1, 56
-; LMULMAX1-RV64-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX1-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX1-RV64-NEXT:    ret
+; LMULMAX2-RV64F-LABEL: cttz_v2i64:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX2-RV64F-NEXT:    vand.vv v9, v8, v9
+; LMULMAX2-RV64F-NEXT:    vmset.m v0
+; LMULMAX2-RV64F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; LMULMAX2-RV64F-NEXT:    vfncvt.f.xu.w v10, v9, v0.t
+; LMULMAX2-RV64F-NEXT:    fsrm a1
+; LMULMAX2-RV64F-NEXT:    vsrl.vi v9, v10, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 127
+; LMULMAX2-RV64F-NEXT:    vwsubu.vx v10, v9, a1
+; LMULMAX2-RV64F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 64
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
+;
+; LMULMAX2-RV32D-LABEL: cttz_v2i64:
+; LMULMAX2-RV32D:       # %bb.0:
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32D-NEXT:    vle64.v v9, (a0)
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX2-RV32D-NEXT:    vmv.v.i v10, 0
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV32D-NEXT:    vmseq.vv v8, v9, v10
+; LMULMAX2-RV32D-NEXT:    vsub.vv v10, v10, v9
+; LMULMAX2-RV32D-NEXT:    vand.vv v9, v9, v10
+; LMULMAX2-RV32D-NEXT:    vmset.m v0
+; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX2-RV32D-NEXT:    fsrm a1
+; LMULMAX2-RV32D-NEXT:    li a1, 52
+; LMULMAX2-RV32D-NEXT:    vsrl.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT:    li a1, 1023
+; LMULMAX2-RV32D-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX2-RV32D-NEXT:    li a1, 64
+; LMULMAX2-RV32D-NEXT:    vmv.v.v v0, v8
+; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV32D-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    ret
+;
+; LMULMAX2-RV64D-LABEL: cttz_v2i64:
+; LMULMAX2-RV64D:       # %bb.0:
+; LMULMAX2-RV64D-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX2-RV64D-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX2-RV64D-NEXT:    vand.vv v9, v8, v9
+; LMULMAX2-RV64D-NEXT:    vmset.m v0
+; LMULMAX2-RV64D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT:    vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX2-RV64D-NEXT:    fsrm a1
+; LMULMAX2-RV64D-NEXT:    li a1, 52
+; LMULMAX2-RV64D-NEXT:    vsrl.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT:    li a1, 1023
+; LMULMAX2-RV64D-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX2-RV64D-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT:    li a1, 64
+; LMULMAX2-RV64D-NEXT:    vmerge.vxm v8, v9, a1, v0
+; LMULMAX2-RV64D-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    ret
 ;
 ; LMULMAX8-RV32-LABEL: cttz_v2i64:
 ; LMULMAX8-RV32:       # %bb.0:
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    li a1, 1
-; LMULMAX8-RV32-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.i v10, -1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vxor.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX8-RV32-NEXT:    lui a1, 349525
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 1365
+; LMULMAX8-RV32-NEXT:    vle64.v v9, (a0)
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v10, a1
+; LMULMAX8-RV32-NEXT:    vmv.v.i v10, 0
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX8-RV32-NEXT:    vmseq.vv v8, v9, v10
+; LMULMAX8-RV32-NEXT:    vsub.vv v10, v10, v9
 ; LMULMAX8-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX8-RV32-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    lui a1, 209715
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v10, v8, v9
-; LMULMAX8-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX8-RV32-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX8-RV32-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    lui a1, 61681
-; LMULMAX8-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    lui a1, 4112
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v9, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX8-RV32-NEXT:    vmul.vv v8, v8, v9
-; LMULMAX8-RV32-NEXT:    li a1, 56
-; LMULMAX8-RV32-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT:    vmset.m v0
+; LMULMAX8-RV32-NEXT:    fsrmi a1, 1
+; LMULMAX8-RV32-NEXT:    vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX8-RV32-NEXT:    fsrm a1
+; LMULMAX8-RV32-NEXT:    li a1, 52
+; LMULMAX8-RV32-NEXT:    vsrl.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT:    li a1, 1023
+; LMULMAX8-RV32-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX8-RV32-NEXT:    li a1, 64
+; LMULMAX8-RV32-NEXT:    vmv.v.v v0, v8
+; LMULMAX8-RV32-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; LMULMAX8-RV32-NEXT:    vse64.v v8, (a0)
 ; LMULMAX8-RV32-NEXT:    ret
 ;
@@ -629,31 +628,19 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX8-RV64:       # %bb.0:
 ; LMULMAX8-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; LMULMAX8-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    li a1, 1
-; LMULMAX8-RV64-NEXT:    vsub.vx v9, v8, a1
-; LMULMAX8-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX8-RV64-NEXT:    vand.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    lui a1, %hi(.LCPI3_0)
-; LMULMAX8-RV64-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
-; LMULMAX8-RV64-NEXT:    lui a2, %hi(.LCPI3_1)
-; LMULMAX8-RV64-NEXT:    ld a2, %lo(.LCPI3_1)(a2)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v9, v8, 1
-; LMULMAX8-RV64-NEXT:    vand.vx v9, v9, a1
-; LMULMAX8-RV64-NEXT:    vsub.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    vand.vx v9, v8, a2
-; LMULMAX8-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX8-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT:    vadd.vv v8, v9, v8
-; LMULMAX8-RV64-NEXT:    lui a1, %hi(.LCPI3_2)
-; LMULMAX8-RV64-NEXT:    ld a1, %lo(.LCPI3_2)(a1)
-; LMULMAX8-RV64-NEXT:    lui a2, %hi(.LCPI3_3)
-; LMULMAX8-RV64-NEXT:    ld a2, %lo(.LCPI3_3)(a2)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v9, v8, 4
-; LMULMAX8-RV64-NEXT:    vadd.vv v8, v8, v9
-; LMULMAX8-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT:    li a1, 56
-; LMULMAX8-RV64-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT:    vrsub.vi v9, v8, 0
+; LMULMAX8-RV64-NEXT:    vand.vv v9, v8, v9
+; LMULMAX8-RV64-NEXT:    vmset.m v0
+; LMULMAX8-RV64-NEXT:    fsrmi a1, 1
+; LMULMAX8-RV64-NEXT:    vfcvt.f.xu.v v9, v9, v0.t
+; LMULMAX8-RV64-NEXT:    fsrm a1
+; LMULMAX8-RV64-NEXT:    li a1, 52
+; LMULMAX8-RV64-NEXT:    vsrl.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT:    li a1, 1023
+; LMULMAX8-RV64-NEXT:    vsub.vx v9, v9, a1
+; LMULMAX8-RV64-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT:    li a1, 64
+; LMULMAX8-RV64-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; LMULMAX8-RV64-NEXT:    vse64.v v8, (a0)
 ; LMULMAX8-RV64-NEXT:    ret
   %a = load <2 x i64>, ptr %x
@@ -936,165 +923,143 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
 declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
 
 define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: cttz_v8i32:
-; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vle32.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    li a1, 1
-; LMULMAX2-RV32-NEXT:    vsub.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT:    vnot.v v8, v8
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT:    lui a1, 349525
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vand.vx v10, v10, a1
-; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 209715
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vand.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 61681
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    lui a1, 4112
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32-NEXT:    vmul.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX2-RV32-NEXT:    vse32.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    ret
+; LMULMAX2-RV32I-LABEL: cttz_v8i32:
+; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    li a1, 1
+; LMULMAX2-RV32I-NEXT:    vsub.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    vand.vx v10, v10, a1
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    vand.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    vmul.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 24
+; LMULMAX2-RV32I-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    ret
 ;
-; LMULMAX2-RV64-LABEL: cttz_v8i32:
-; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV64-NEXT:    vle32.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    li a1, 1
-; LMULMAX2-RV64-NEXT:    vsub.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX2-RV64-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT:    lui a1, 349525
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, 1365
-; LMULMAX2-RV64-NEXT:    vand.vx v10, v10, a1
-; LMULMAX2-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    lui a1, 209715
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, 819
-; LMULMAX2-RV64-NEXT:    vand.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    lui a1, 61681
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, -241
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    lui a1, 4112
-; LMULMAX2-RV64-NEXT:    addiw a1, a1, 257
-; LMULMAX2-RV64-NEXT:    vmul.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX2-RV64-NEXT:    vse32.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    ret
+; LMULMAX2-RV64I-LABEL: cttz_v8i32:
+; LMULMAX2-RV64I:       # %bb.0:
+; LMULMAX2-RV64I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64I-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    li a1, 1
+; LMULMAX2-RV64I-NEXT:    vsub.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT:    lui a1, 349525
+; LMULMAX2-RV64I-NEXT:    addiw a1, a1, 1365
+; LMULMAX2-RV64I-NEXT:    vand.vx v10, v10, a1
+; LMULMAX2-RV64I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    lui a1, 209715
+; LMULMAX2-RV64I-NEXT:    addiw a1, a1, 819
+; LMULMAX2-RV64I-NEXT:    vand.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    lui a1, 61681
+; LMULMAX2-RV64I-NEXT:    addiw a1, a1, -241
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    lui a1, 4112
+; LMULMAX2-RV64I-NEXT:    addiw a1, a1, 257
+; LMULMAX2-RV64I-NEXT:    vmul.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v8, v8, 24
+; LMULMAX2-RV64I-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    ret
 ;
-; LMULMAX1-RV32-LABEL: cttz_v8i32:
-; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle32.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    vle32.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    li a2, 1
-; LMULMAX1-RV32-NEXT:    vsub.vx v10, v8, a2
-; LMULMAX1-RV32-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT:    lui a3, 349525
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 1365
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a3
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    lui a4, 209715
-; LMULMAX1-RV32-NEXT:    addi a4, a4, 819
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v8, a4
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    lui a5, 61681
-; LMULMAX1-RV32-NEXT:    addi a5, a5, -241
-; LMULMAX1-RV32-NEXT:    vand.vx v8, v8, a5
-; LMULMAX1-RV32-NEXT:    lui a6, 4112
-; LMULMAX1-RV32-NEXT:    addi a6, a6, 257
-; LMULMAX1-RV32-NEXT:    vmul.vx v8, v8, a6
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX1-RV32-NEXT:    vsub.vx v10, v9, a2
-; LMULMAX1-RV32-NEXT:    vnot.v v9, v9
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v10, a3
-; LMULMAX1-RV32-NEXT:    vsub.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vand.vx v10, v9, a4
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a4
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v10, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vand.vx v9, v9, a5
-; LMULMAX1-RV32-NEXT:    vmul.vx v9, v9, a6
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v9, 24
-; LMULMAX1-RV32-NEXT:    vse32.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vse32.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    ret
+; LMULMAX2-RV32F-LABEL: cttz_v8i32:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX2-RV32F-NEXT:    vand.vv v10, v8, v10
+; LMULMAX2-RV32F-NEXT:    vmset.m v0
+; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV32F-NEXT:    fsrm a1
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV32F-NEXT:    li a1, 127
+; LMULMAX2-RV32F-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX2-RV32F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV32F-NEXT:    li a1, 32
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
 ;
-; LMULMAX1-RV64-LABEL: cttz_v8i32:
-; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV64-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV64-NEXT:    vle32.v v8, (a1)
-; LMULMAX1-RV64-NEXT:    vle32.v v9, (a0)
-; LMULMAX1-RV64-NEXT:    li a2, 1
-; LMULMAX1-RV64-NEXT:    vsub.vx v10, v8, a2
-; LMULMAX1-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV64-NEXT:    vand.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT:    lui a3, 349525
-; LMULMAX1-RV64-NEXT:    addiw a3, a3, 1365
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    lui a4, 209715
-; LMULMAX1-RV64-NEXT:    addiw a4, a4, 819
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v8, a4
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    lui a5, 61681
-; LMULMAX1-RV64-NEXT:    addiw a5, a5, -241
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a5
-; LMULMAX1-RV64-NEXT:    lui a6, 4112
-; LMULMAX1-RV64-NEXT:    addiw a6, a6, 257
-; LMULMAX1-RV64-NEXT:    vmul.vx v8, v8, a6
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 24
-; LMULMAX1-RV64-NEXT:    vsub.vx v10, v9, a2
-; LMULMAX1-RV64-NEXT:    vnot.v v9, v9
-; LMULMAX1-RV64-NEXT:    vand.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT:    vsub.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v9, a4
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v9, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a4
-; LMULMAX1-RV64-NEXT:    vadd.vv v9, v10, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a5
-; LMULMAX1-RV64-NEXT:    vmul.vx v9, v9, a6
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v9, 24
-; LMULMAX1-RV64-NEXT:    vse32.v v9, (a0)
-; LMULMAX1-RV64-NEXT:    vse32.v v8, (a1)
-; LMULMAX1-RV64-NEXT:    ret
+; LMULMAX2-RV64F-LABEL: cttz_v8i32:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX2-RV64F-NEXT:    vand.vv v10, v8, v10
+; LMULMAX2-RV64F-NEXT:    vmset.m v0
+; LMULMAX2-RV64F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV64F-NEXT:    fsrm a1
+; LMULMAX2-RV64F-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 127
+; LMULMAX2-RV64F-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 32
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
+;
+; LMULMAX2-RV32D-LABEL: cttz_v8i32:
+; LMULMAX2-RV32D:       # %bb.0:
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32D-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX2-RV32D-NEXT:    vand.vv v10, v8, v10
+; LMULMAX2-RV32D-NEXT:    vmset.m v0
+; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV32D-NEXT:    fsrm a1
+; LMULMAX2-RV32D-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV32D-NEXT:    li a1, 127
+; LMULMAX2-RV32D-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV32D-NEXT:    li a1, 32
+; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32D-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    ret
+;
+; LMULMAX2-RV64D-LABEL: cttz_v8i32:
+; LMULMAX2-RV64D:       # %bb.0:
+; LMULMAX2-RV64D-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV64D-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX2-RV64D-NEXT:    vand.vv v10, v8, v10
+; LMULMAX2-RV64D-NEXT:    vmset.m v0
+; LMULMAX2-RV64D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV64D-NEXT:    fsrm a1
+; LMULMAX2-RV64D-NEXT:    vsrl.vi v10, v10, 23
+; LMULMAX2-RV64D-NEXT:    li a1, 127
+; LMULMAX2-RV64D-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT:    li a1, 32
+; LMULMAX2-RV64D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64D-NEXT:    vse32.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    ret
 ;
 ; LMULMAX8-LABEL: cttz_v8i32:
 ; LMULMAX8:       # %bb.0:
@@ -1121,244 +1086,197 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
 declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
 
 define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
-; LMULMAX2-RV32-LABEL: cttz_v4i64:
-; LMULMAX2-RV32:       # %bb.0:
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    li a1, 1
-; LMULMAX2-RV32-NEXT:    vsub.vx v10, v8, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.i v12, -1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vxor.vv v8, v8, v12
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV32-NEXT:    lui a1, 349525
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 1365
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v12, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX2-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 209715
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v12, v8, v10
-; LMULMAX2-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX2-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV32-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 61681
-; LMULMAX2-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    lui a1, 4112
-; LMULMAX2-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX2-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV32-NEXT:    vmul.vv v8, v8, v10
-; LMULMAX2-RV32-NEXT:    li a1, 56
-; LMULMAX2-RV32-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX2-RV32-NEXT:    vse64.v v8, (a0)
-; LMULMAX2-RV32-NEXT:    ret
+; LMULMAX2-RV32I-LABEL: cttz_v4i64:
+; LMULMAX2-RV32I:       # %bb.0:
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    li a1, 1
+; LMULMAX2-RV32I-NEXT:    vsub.vx v10, v8, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.i v12, -1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vxor.vv v8, v8, v12
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV32I-NEXT:    lui a1, 349525
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 1365
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v12, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v10, v10, v12
+; LMULMAX2-RV32I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 209715
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 819
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v12, v8, v10
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v12, v8
+; LMULMAX2-RV32I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV32I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 61681
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, -241
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    lui a1, 4112
+; LMULMAX2-RV32I-NEXT:    addi a1, a1, 257
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmv.v.x v10, a1
+; LMULMAX2-RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32I-NEXT:    vmul.vv v8, v8, v10
+; LMULMAX2-RV32I-NEXT:    li a1, 56
+; LMULMAX2-RV32I-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV32I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32I-NEXT:    ret
 ;
-; LMULMAX2-RV64-LABEL: cttz_v4i64:
-; LMULMAX2-RV64:       # %bb.0:
-; LMULMAX2-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX2-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    li a1, 1
-; LMULMAX2-RV64-NEXT:    vsub.vx v10, v8, a1
-; LMULMAX2-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX2-RV64-NEXT:    vand.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI7_0)
-; LMULMAX2-RV64-NEXT:    ld a1, %lo(.LCPI7_0)(a1)
-; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI7_1)
-; LMULMAX2-RV64-NEXT:    ld a2, %lo(.LCPI7_1)(a2)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX2-RV64-NEXT:    vand.vx v10, v10, a1
-; LMULMAX2-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vand.vx v10, v8, a2
-; LMULMAX2-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX2-RV64-NEXT:    lui a1, %hi(.LCPI7_2)
-; LMULMAX2-RV64-NEXT:    ld a1, %lo(.LCPI7_2)(a1)
-; LMULMAX2-RV64-NEXT:    lui a2, %hi(.LCPI7_3)
-; LMULMAX2-RV64-NEXT:    ld a2, %lo(.LCPI7_3)(a2)
-; LMULMAX2-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX2-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX2-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX2-RV64-NEXT:    li a1, 56
-; LMULMAX2-RV64-NEXT:    vsrl.vx v8, v8, a1
-; LMULMAX2-RV64-NEXT:    vse64.v v8, (a0)
-; LMULMAX2-RV64-NEXT:    ret
+; LMULMAX2-RV64I-LABEL: cttz_v4i64:
+; LMULMAX2-RV64I:       # %bb.0:
+; LMULMAX2-RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64I-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    li a1, 1
+; LMULMAX2-RV64I-NEXT:    vsub.vx v10, v8, a1
+; LMULMAX2-RV64I-NEXT:    vnot.v v8, v8
+; LMULMAX2-RV64I-NEXT:    vand.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    lui a1, %hi(.LCPI7_0)
+; LMULMAX2-RV64I-NEXT:    ld a1, %lo(.LCPI7_0)(a1)
+; LMULMAX2-RV64I-NEXT:    lui a2, %hi(.LCPI7_1)
+; LMULMAX2-RV64I-NEXT:    ld a2, %lo(.LCPI7_1)(a2)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 1
+; LMULMAX2-RV64I-NEXT:    vand.vx v10, v10, a1
+; LMULMAX2-RV64I-NEXT:    vsub.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vand.vx v10, v8, a2
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v8, v8, 2
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v10, v8
+; LMULMAX2-RV64I-NEXT:    lui a1, %hi(.LCPI7_2)
+; LMULMAX2-RV64I-NEXT:    ld a1, %lo(.LCPI7_2)(a1)
+; LMULMAX2-RV64I-NEXT:    lui a2, %hi(.LCPI7_3)
+; LMULMAX2-RV64I-NEXT:    ld a2, %lo(.LCPI7_3)(a2)
+; LMULMAX2-RV64I-NEXT:    vsrl.vi v10, v8, 4
+; LMULMAX2-RV64I-NEXT:    vadd.vv v8, v8, v10
+; LMULMAX2-RV64I-NEXT:    vand.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vmul.vx v8, v8, a2
+; LMULMAX2-RV64I-NEXT:    li a1, 56
+; LMULMAX2-RV64I-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX2-RV64I-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64I-NEXT:    ret
 ;
-; LMULMAX1-RV32-LABEL: cttz_v4i64:
-; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV32-NEXT:    vle64.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    vle64.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    li a2, 1
-; LMULMAX1-RV32-NEXT:    vsub.vx v10, v8, a2
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.i v11, -1
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vxor.vv v8, v8, v11
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV32-NEXT:    lui a3, 349525
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 1365
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v12, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX1-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    lui a3, 209715
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 819
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v10, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v13, v8, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v13, v8
-; LMULMAX1-RV32-NEXT:    vsrl.vi v13, v8, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT:    lui a3, 61681
-; LMULMAX1-RV32-NEXT:    addi a3, a3, -241
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v13, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vand.vv v8, v8, v13
-; LMULMAX1-RV32-NEXT:    lui a3, 4112
-; LMULMAX1-RV32-NEXT:    addi a3, a3, 257
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmv.v.x v14, a3
-; LMULMAX1-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV32-NEXT:    vmul.vv v8, v8, v14
-; LMULMAX1-RV32-NEXT:    li a3, 56
-; LMULMAX1-RV32-NEXT:    vsrl.vx v8, v8, a3
-; LMULMAX1-RV32-NEXT:    vsub.vx v15, v9, a2
-; LMULMAX1-RV32-NEXT:    vxor.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v15
-; LMULMAX1-RV32-NEXT:    vsrl.vi v11, v9, 1
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v11, v12
-; LMULMAX1-RV32-NEXT:    vsub.vv v9, v9, v11
-; LMULMAX1-RV32-NEXT:    vand.vv v11, v9, v10
-; LMULMAX1-RV32-NEXT:    vsrl.vi v9, v9, 2
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v11, v9
-; LMULMAX1-RV32-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV32-NEXT:    vadd.vv v9, v9, v10
-; LMULMAX1-RV32-NEXT:    vand.vv v9, v9, v13
-; LMULMAX1-RV32-NEXT:    vmul.vv v9, v9, v14
-; LMULMAX1-RV32-NEXT:    vsrl.vx v9, v9, a3
-; LMULMAX1-RV32-NEXT:    vse64.v v9, (a0)
-; LMULMAX1-RV32-NEXT:    vse64.v v8, (a1)
-; LMULMAX1-RV32-NEXT:    ret
+; LMULMAX2-RV32F-LABEL: cttz_v4i64:
+; LMULMAX2-RV32F:       # %bb.0:
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vle64.v v10, (a0)
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmv.v.i v12, 0
+; LMULMAX2-RV32F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vmseq.vv v8, v10, v12
+; LMULMAX2-RV32F-NEXT:    vsub.vv v12, v12, v10
+; LMULMAX2-RV32F-NEXT:    vand.vv v10, v10, v12
+; LMULMAX2-RV32F-NEXT:    vmset.m v0
+; LMULMAX2-RV32F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX2-RV32F-NEXT:    vfncvt.f.xu.w v9, v10, v0.t
+; LMULMAX2-RV32F-NEXT:    fsrm a1
+; LMULMAX2-RV32F-NEXT:    vsrl.vi v9, v9, 23
+; LMULMAX2-RV32F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV32F-NEXT:    vzext.vf2 v10, v9
+; LMULMAX2-RV32F-NEXT:    li a1, 127
+; LMULMAX2-RV32F-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX2-RV32F-NEXT:    li a1, 64
+; LMULMAX2-RV32F-NEXT:    vmv1r.v v0, v8
+; LMULMAX2-RV32F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32F-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32F-NEXT:    ret
 ;
-; LMULMAX1-RV64-LABEL: cttz_v4i64:
-; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-RV64-NEXT:    addi a1, a0, 16
-; LMULMAX1-RV64-NEXT:    vle64.v v8, (a1)
-; LMULMAX1-RV64-NEXT:    vle64.v v9, (a0)
-; LMULMAX1-RV64-NEXT:    li a2, 1
-; LMULMAX1-RV64-NEXT:    vsub.vx v10, v8, a2
-; LMULMAX1-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX1-RV64-NEXT:    vand.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    lui a3, %hi(.LCPI7_0)
-; LMULMAX1-RV64-NEXT:    ld a3, %lo(.LCPI7_0)(a3)
-; LMULMAX1-RV64-NEXT:    lui a4, %hi(.LCPI7_1)
-; LMULMAX1-RV64-NEXT:    ld a4, %lo(.LCPI7_1)(a4)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v8, a4
-; LMULMAX1-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX1-RV64-NEXT:    lui a5, %hi(.LCPI7_2)
-; LMULMAX1-RV64-NEXT:    ld a5, %lo(.LCPI7_2)(a5)
-; LMULMAX1-RV64-NEXT:    lui a6, %hi(.LCPI7_3)
-; LMULMAX1-RV64-NEXT:    ld a6, %lo(.LCPI7_3)(a6)
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v8, v8, a5
-; LMULMAX1-RV64-NEXT:    vmul.vx v8, v8, a6
-; LMULMAX1-RV64-NEXT:    li a7, 56
-; LMULMAX1-RV64-NEXT:    vsrl.vx v8, v8, a7
-; LMULMAX1-RV64-NEXT:    vsub.vx v10, v9, a2
-; LMULMAX1-RV64-NEXT:    vnot.v v9, v9
-; LMULMAX1-RV64-NEXT:    vand.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 1
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v10, a3
-; LMULMAX1-RV64-NEXT:    vsub.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v10, v9, a4
-; LMULMAX1-RV64-NEXT:    vsrl.vi v9, v9, 2
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a4
-; LMULMAX1-RV64-NEXT:    vadd.vv v9, v10, v9
-; LMULMAX1-RV64-NEXT:    vsrl.vi v10, v9, 4
-; LMULMAX1-RV64-NEXT:    vadd.vv v9, v9, v10
-; LMULMAX1-RV64-NEXT:    vand.vx v9, v9, a5
-; LMULMAX1-RV64-NEXT:    vmul.vx v9, v9, a6
-; LMULMAX1-RV64-NEXT:    vsrl.vx v9, v9, a7
-; LMULMAX1-RV64-NEXT:    vse64.v v9, (a0)
-; LMULMAX1-RV64-NEXT:    vse64.v v8, (a1)
-; LMULMAX1-RV64-NEXT:    ret
+; LMULMAX2-RV64F-LABEL: cttz_v4i64:
+; LMULMAX2-RV64F:       # %bb.0:
+; LMULMAX2-RV64F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64F-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX2-RV64F-NEXT:    vand.vv v10, v8, v10
+; LMULMAX2-RV64F-NEXT:    vmset.m v0
+; LMULMAX2-RV64F-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX2-RV64F-NEXT:    vfncvt.f.xu.w v12, v10, v0.t
+; LMULMAX2-RV64F-NEXT:    fsrm a1
+; LMULMAX2-RV64F-NEXT:    vsrl.vi v10, v12, 23
+; LMULMAX2-RV64F-NEXT:    li a1, 127
+; LMULMAX2-RV64F-NEXT:    vwsubu.vx v12, v10, a1
+; LMULMAX2-RV64F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; LMULMAX2-RV64F-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64F-NEXT:    li a1, 64
+; LMULMAX2-RV64F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; LMULMAX2-RV64F-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64F-NEXT:    ret
+;
+; LMULMAX2-RV32D-LABEL: cttz_v4i64:
+; LMULMAX2-RV32D:       # %bb.0:
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT:    vle64.v v10, (a0)
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-RV32D-NEXT:    vmv.v.i v12, 0
+; LMULMAX2-RV32D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV32D-NEXT:    vmseq.vv v8, v10, v12
+; LMULMAX2-RV32D-NEXT:    vsub.vv v12, v12, v10
+; LMULMAX2-RV32D-NEXT:    vand.vv v10, v10, v12
+; LMULMAX2-RV32D-NEXT:    vmset.m v0
+; LMULMAX2-RV32D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV32D-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV32D-NEXT:    fsrm a1
+; LMULMAX2-RV32D-NEXT:    li a1, 52
+; LMULMAX2-RV32D-NEXT:    vsrl.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT:    li a1, 1023
+; LMULMAX2-RV32D-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX2-RV32D-NEXT:    li a1, 64
+; LMULMAX2-RV32D-NEXT:    vmv1r.v v0, v8
+; LMULMAX2-RV32D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV32D-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV32D-NEXT:    ret
+;
+; LMULMAX2-RV64D-LABEL: cttz_v4i64:
+; LMULMAX2-RV64D:       # %bb.0:
+; LMULMAX2-RV64D-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-RV64D-NEXT:    vle64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX2-RV64D-NEXT:    vand.vv v10, v8, v10
+; LMULMAX2-RV64D-NEXT:    vmset.m v0
+; LMULMAX2-RV64D-NEXT:    fsrmi a1, 1
+; LMULMAX2-RV64D-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX2-RV64D-NEXT:    fsrm a1
+; LMULMAX2-RV64D-NEXT:    li a1, 52
+; LMULMAX2-RV64D-NEXT:    vsrl.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT:    li a1, 1023
+; LMULMAX2-RV64D-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX2-RV64D-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX2-RV64D-NEXT:    li a1, 64
+; LMULMAX2-RV64D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; LMULMAX2-RV64D-NEXT:    vse64.v v8, (a0)
+; LMULMAX2-RV64D-NEXT:    ret
 ;
 ; LMULMAX8-RV32-LABEL: cttz_v4i64:
 ; LMULMAX8-RV32:       # %bb.0:
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV32-NEXT:    li a1, 1
-; LMULMAX8-RV32-NEXT:    vsub.vx v10, v8, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.i v12, -1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vxor.vv v8, v8, v12
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX8-RV32-NEXT:    lui a1, 349525
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 1365
+; LMULMAX8-RV32-NEXT:    vle64.v v10, (a0)
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v12, a1
+; LMULMAX8-RV32-NEXT:    vmv.v.i v12, 0
 ; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX8-RV32-NEXT:    vmseq.vv v8, v10, v12
+; LMULMAX8-RV32-NEXT:    vsub.vv v12, v12, v10
 ; LMULMAX8-RV32-NEXT:    vand.vv v10, v10, v12
-; LMULMAX8-RV32-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    lui a1, 209715
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 819
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v12, v8, v10
-; LMULMAX8-RV32-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    vadd.vv v8, v12, v8
-; LMULMAX8-RV32-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX8-RV32-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    lui a1, 61681
-; LMULMAX8-RV32-NEXT:    addi a1, a1, -241
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vand.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    lui a1, 4112
-; LMULMAX8-RV32-NEXT:    addi a1, a1, 257
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmv.v.x v10, a1
-; LMULMAX8-RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; LMULMAX8-RV32-NEXT:    vmul.vv v8, v8, v10
-; LMULMAX8-RV32-NEXT:    li a1, 56
-; LMULMAX8-RV32-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-RV32-NEXT:    vmset.m v0
+; LMULMAX8-RV32-NEXT:    fsrmi a1, 1
+; LMULMAX8-RV32-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX8-RV32-NEXT:    fsrm a1
+; LMULMAX8-RV32-NEXT:    li a1, 52
+; LMULMAX8-RV32-NEXT:    vsrl.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT:    li a1, 1023
+; LMULMAX8-RV32-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX8-RV32-NEXT:    li a1, 64
+; LMULMAX8-RV32-NEXT:    vmv1r.v v0, v8
+; LMULMAX8-RV32-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; LMULMAX8-RV32-NEXT:    vse64.v v8, (a0)
 ; LMULMAX8-RV32-NEXT:    ret
 ;
@@ -1366,31 +1284,19 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; LMULMAX8-RV64:       # %bb.0:
 ; LMULMAX8-RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; LMULMAX8-RV64-NEXT:    vle64.v v8, (a0)
-; LMULMAX8-RV64-NEXT:    li a1, 1
-; LMULMAX8-RV64-NEXT:    vsub.vx v10, v8, a1
-; LMULMAX8-RV64-NEXT:    vnot.v v8, v8
-; LMULMAX8-RV64-NEXT:    vand.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    lui a1, %hi(.LCPI7_0)
-; LMULMAX8-RV64-NEXT:    ld a1, %lo(.LCPI7_0)(a1)
-; LMULMAX8-RV64-NEXT:    lui a2, %hi(.LCPI7_1)
-; LMULMAX8-RV64-NEXT:    ld a2, %lo(.LCPI7_1)(a2)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v10, v8, 1
-; LMULMAX8-RV64-NEXT:    vand.vx v10, v10, a1
-; LMULMAX8-RV64-NEXT:    vsub.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    vand.vx v10, v8, a2
-; LMULMAX8-RV64-NEXT:    vsrl.vi v8, v8, 2
-; LMULMAX8-RV64-NEXT:    vand.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT:    vadd.vv v8, v10, v8
-; LMULMAX8-RV64-NEXT:    lui a1, %hi(.LCPI7_2)
-; LMULMAX8-RV64-NEXT:    ld a1, %lo(.LCPI7_2)(a1)
-; LMULMAX8-RV64-NEXT:    lui a2, %hi(.LCPI7_3)
-; LMULMAX8-RV64-NEXT:    ld a2, %lo(.LCPI7_3)(a2)
-; LMULMAX8-RV64-NEXT:    vsrl.vi v10, v8, 4
-; LMULMAX8-RV64-NEXT:    vadd.vv v8, v8, v10
-; LMULMAX8-RV64-NEXT:    vand.vx v8, v8, a1
-; LMULMAX8-RV64-NEXT:    vmul.vx v8, v8, a2
-; LMULMAX8-RV64-NEXT:    li a1, 56
-; LMULMAX8-RV64-NEXT:    vsrl.vx v8, v8, a1
+; LMULMAX8-RV64-NEXT:    vrsub.vi v10, v8, 0
+; LMULMAX8-RV64-NEXT:    vand.vv v10, v8, v10
+; LMULMAX8-RV64-NEXT:    vmset.m v0
+; LMULMAX8-RV64-NEXT:    fsrmi a1, 1
+; LMULMAX8-RV64-NEXT:    vfcvt.f.xu.v v10, v10, v0.t
+; LMULMAX8-RV64-NEXT:    fsrm a1
+; LMULMAX8-RV64-NEXT:    li a1, 52
+; LMULMAX8-RV64-NEXT:    vsrl.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT:    li a1, 1023
+; LMULMAX8-RV64-NEXT:    vsub.vx v10, v10, a1
+; LMULMAX8-RV64-NEXT:    vmseq.vi v0, v8, 0
+; LMULMAX8-RV64-NEXT:    li a1, 64
+; LMULMAX8-RV64-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; LMULMAX8-RV64-NEXT:    vse64.v v8, (a0)
 ; LMULMAX8-RV64-NEXT:    ret
   %a = load <4 x i64>, ptr %x
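
; NOTE (not part of the committed test): the new CHECK lines above all follow the same
; shape — isolate the lowest set bit, convert it to f32 under RTZ, read the count out of
; the biased exponent, and patch the zero lane with vmerge. A minimal scalar sketch in C
; of that pattern is below; the function name and structure are illustrative only and do
; not appear in the patch.
;
;   #include <stdint.h>
;   #include <string.h>
;
;   /* Illustrative scalar model of the vector cttz lowering exercised above. */
;   static unsigned cttz32_via_f32(uint32_t x) {
;     if (x == 0)
;       return 32;                   /* zero lane: vmseq.vi + vmerge.vxm with the element width */
;     uint32_t lsb = x & (0u - x);   /* vrsub.vi + vand.vv isolate the lowest set bit */
;     float f = (float)lsb;          /* vfcvt.f.xu.v under RTZ; exact here, lsb is a power of two */
;     uint32_t bits;
;     memcpy(&bits, &f, sizeof(bits));
;     return (bits >> 23) - 127;     /* vsrl.vi 23 + vsub.vx 127 recover the shift amount */
;   }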
